def run(definition, dataset, count, run_count, batch):
    """Build one algorithm's index on a dataset and benchmark its queries.

    The algorithm described by ``definition`` is instantiated, fitted on the
    transformed ``train`` split of ``dataset`` and then queried once per
    query-argument group; every group's outcome is handed to
    ``store_results``.  ``algo.done()`` is always called once the dataset has
    been loaded and transformed, even if fitting or querying fails.

    Args:
        definition: algorithm definition; its ``query_argument_groups``,
            ``module``, ``constructor``, ``arguments`` and ``algorithm``
            attributes are read here.
        dataset: dataset name given to ``get_dataset`` and recorded in the
            stored descriptor.
        count: forwarded to ``run_individual_query`` and ``store_results``.
        run_count: forwarded to ``run_individual_query``.
        batch: forwarded to ``run_individual_query``, ``get_algorithm_name``
            and ``store_results``.

    Raises:
        AssertionError: if query argument groups are configured but the
            instantiated algorithm has no ``set_query_arguments`` attribute.
    """
    algo = instantiate_algorithm(definition)
    assert not definition.query_argument_groups \
        or hasattr(algo, "set_query_arguments"), """\
error: query argument groups have been specified for %s.%s(%s), but the \
algorithm instantiated from it does not implement the set_query_arguments \
function""" % (definition.module, definition.constructor, definition.arguments)

    data = get_dataset(dataset)
    train_points = numpy.array(data['train'])
    test_points = numpy.array(data['test'])
    distance = data.attrs['distance']

    # Diagnostic output describing what was loaded.
    print("type D: ", type(data))
    print("type x_train: ", type(train_points))
    print("type x_test: ", type(test_points))
    print("type distance: ", type(distance))
    print('got a train set of size (%d * %d)' % train_points.shape)
    print('got %d queries' % len(test_points))

    # Apply the per-metric transformation to both splits.
    train_points = dataset_transform[distance](train_points)
    test_points = dataset_transform[distance](test_points)

    try:
        # The capability is probed here, but this function makes no further
        # use of the answer.
        prepared_queries = False
        if hasattr(algo, "supports_prepared_queries"):
            prepared_queries = algo.supports_prepared_queries()

        started = time.time()
        memory_before = algo.get_memory_usage()
        algo.fit(train_points)
        build_time = time.time() - started
        index_size = algo.get_memory_usage() - memory_before
        print('Built index in', build_time)
        print('Index size: ', index_size)

        # Algorithms without query argument groups still have to run once,
        # so give them a single, empty, harmless group.
        groups = definition.query_argument_groups or [[]]
        group_total = len(groups)

        for position, query_arguments in enumerate(groups, 1):
            print("Running query argument group %d of %d..."
                  % (position, group_total))
            if query_arguments:
                algo.set_query_arguments(*query_arguments)

            descriptor, results = run_individual_query(
                algo, train_points, test_points, distance, count, run_count,
                batch)

            descriptor["build_time"] = build_time
            descriptor["index_size"] = index_size
            descriptor["algo"] = get_algorithm_name(
                definition.algorithm, batch)
            descriptor["dataset"] = dataset

            store_results(dataset, count, definition, query_arguments,
                          descriptor, results, batch)
    finally:
        algo.done()
def run(definition, dataset, count, run_count, batch):
    """Fit an algorithm on a dataset and benchmark each query-argument group.

    The algorithm built from ``definition`` is fitted on the ``train`` split
    of ``dataset``; afterwards ``run_individual_query`` is invoked once per
    query-argument group and the outcome is passed to ``store_results``.
    ``algo.done()`` is always called after the dataset has been loaded, even
    if fitting or querying raises.

    Args:
        definition: algorithm definition; its ``query_argument_groups``,
            ``module``, ``constructor``, ``arguments`` and ``algorithm``
            attributes are read here.
        dataset: dataset name given to ``get_dataset`` and recorded in the
            stored descriptor.
        count: forwarded to ``run_individual_query`` and ``store_results``.
        run_count: forwarded to ``run_individual_query``.
        batch: forwarded to ``run_individual_query``, ``get_algorithm_name``
            and ``store_results``.

    Raises:
        AssertionError: if query argument groups are configured but the
            instantiated algorithm has no ``set_query_arguments`` attribute.
    """
    algo = instantiate_algorithm(definition)
    assert not definition.query_argument_groups \
        or hasattr(algo, "set_query_arguments"), """\
error: query argument groups have been specified for %s.%s(%s), but the \
algorithm instantiated from it does not implement the set_query_arguments \
function""" % (definition.module, definition.constructor, definition.arguments)

    data = get_dataset(dataset)
    train_points = numpy.array(data['train'])
    test_points = numpy.array(data['test'])
    distance = data.attrs['distance']
    print('got a train set of size (%d * %d)' % train_points.shape)
    print('got %d queries' % len(test_points))

    try:
        # The capability is probed here, but this function makes no further
        # use of the answer.
        prepared_queries = False
        if hasattr(algo, "supports_prepared_queries"):
            prepared_queries = algo.supports_prepared_queries()

        started = time.time()
        memory_before = algo.get_memory_usage()
        algo.fit(train_points)
        build_time = time.time() - started
        index_size = algo.get_memory_usage() - memory_before
        print('Built index in', build_time)
        print('Index size: ', index_size)

        # Algorithms without query argument groups still have to run once,
        # so give them a single, empty, harmless group.
        groups = definition.query_argument_groups or [[]]
        group_total = len(groups)

        for position, query_arguments in enumerate(groups, 1):
            print("Running query argument group %d of %d..."
                  % (position, group_total))
            if query_arguments:
                algo.set_query_arguments(*query_arguments)

            descriptor, results = run_individual_query(
                algo, train_points, test_points, distance, count, run_count,
                batch)

            descriptor["build_time"] = build_time
            descriptor["index_size"] = index_size
            descriptor["algo"] = get_algorithm_name(
                definition.algorithm, batch)
            descriptor["dataset"] = dataset

            store_results(dataset, count, definition, query_arguments,
                          descriptor, results, batch)
    finally:
        algo.done()
def run(definition, dataset, count, run_count, batch):
    """Fit an algorithm on a dataset and evaluate each query-argument group.

    Algorithms whose ``builds_graph()`` returns true are fitted on the test
    points followed by the train points and are evaluated with
    ``check_graph``; all others are evaluated with ``run_individual_query``.
    Each group's outcome is passed to ``store_results``.  ``algo.done()`` is
    always called once the setup phase is over, even if fitting or querying
    raises.

    Args:
        definition: algorithm definition; its ``query_argument_groups``,
            ``module``, ``constructor``, ``arguments`` and ``algorithm``
            attributes are read here.
        dataset: dataset name given to ``get_dataset`` and recorded in the
            stored descriptor.
        count: number of neighbours; given to the algorithm via
            ``set_count`` for graph builders, forwarded to the evaluation
            helper, and stored in the descriptor as ``int(count)``.
        run_count: forwarded to ``run_individual_query``.
        batch: forwarded to ``run_individual_query``, ``get_algorithm_name``
            and ``store_results``; also stored as ``batch_mode``.

    Raises:
        AssertionError: if query argument groups are configured but the
            instantiated algorithm has no ``set_query_arguments`` attribute.
    """
    algo = instantiate_algorithm(definition)
    assert not definition.query_argument_groups \
        or hasattr(algo, "set_query_arguments"), """\
error: query argument groups have been specified for %s.%s(%s), but the \
algorithm instantiated from it does not implement the set_query_arguments \
function""" % (definition.module, definition.constructor, definition.arguments)

    data = get_dataset(dataset)
    train_points = numpy.array(data['train'])
    test_points = numpy.array(data['test'])
    if algo.builds_graph():
        # Test data first to avoid converting test set index to graph index
        train_points = numpy.concatenate((test_points, train_points))
        # The protocol expects the count to be given at query time, so it has
        # to be set as a parameter beforehand.
        algo.set_count(count)
    distance = data.attrs['distance']
    print('got a train set of size (%d * %d)' % train_points.shape)
    print('got %d queries' % len(test_points))

    try:
        # The capability is probed here, but this function makes no further
        # use of the answer.
        prepared_queries = False
        if hasattr(algo, "supports_prepared_queries"):
            prepared_queries = algo.supports_prepared_queries()

        started = time.time()
        memory_before = algo.get_memory_usage()
        algo.fit(train_points)
        build_time = time.time() - started
        index_size = algo.get_memory_usage() - memory_before
        print('Built index in', build_time)
        print('Index size: ', index_size)

        # Algorithms without query argument groups still have to run once,
        # so give them a single, empty, harmless group.
        groups = definition.query_argument_groups or [[]]
        group_total = len(groups)

        for position, query_arguments in enumerate(groups, 1):
            print("Running query argument group %d of %d..."
                  % (position, group_total))
            if query_arguments:
                algo.set_query_arguments(*query_arguments)

            if algo.builds_graph():
                descriptor, results = check_graph(
                    algo, train_points, test_points, distance, count)
            else:
                descriptor, results = run_individual_query(
                    algo, train_points, test_points, distance, count,
                    run_count, batch)

            descriptor["build_time"] = build_time
            descriptor["index_size"] = index_size
            descriptor["algo"] = get_algorithm_name(
                definition.algorithm, batch)
            descriptor["dataset"] = dataset
            descriptor["count"] = int(count)
            descriptor["batch_mode"] = batch

            store_results(dataset, count, definition, query_arguments,
                          descriptor, results, batch)
    finally:
        algo.done()
def main():
    """Command-line entry point: benchmark every selected algorithm.

    Parses the command line, optionally lists the known algorithms and
    exits, caps the data-segment memory limit, removes ``INDEX_DIR`` if it
    exists, loads the dataset and then runs each algorithm instance that
    still needs running in its own subprocess, storing whatever results the
    worker sends back through a pipe.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # (flags, keyword options) for every command-line option, in the order
    # they should appear in the help output.
    option_specs = (
        (('--dataset',),
         dict(metavar='NAME',
              help='the dataset to load training points from',
              default='glove-100-angular')),
        (("-k", "--count"),
         dict(default=10,
              type=positive_int,
              help="the number of near neighbours to search for")),
        (('--definitions',),
         dict(metavar='FILE',
              help='load algorithm definitions from FILE',
              default='algos.yaml')),
        (('--algorithm',),
         dict(metavar='NAME',
              help='run only the named algorithm',
              default=None)),
        (('--sub-algorithm',),
         dict(metavar='NAME',
              help='run only the named instance of an algorithm (requires --algo)',
              default=None)),
        (('--list-algorithms',),
         dict(help='print the names of all known algorithms and exit',
              action='store_true',
              default=argparse.SUPPRESS)),
        (('--force',),
         dict(help='''re-run algorithms even if their results already exist''',
              action='store_true')),
        (('--runs',),
         dict(metavar='COUNT',
              type=positive_int,
              help='run each algorithm instance %(metavar)s times and use only the best result',
              default=3)),
        (('--timeout',),
         dict(type=int,
              help='Timeout (in seconds) for each individual algorithm run, or -1 if no timeout should be set',
              default=-1)),
        (('--single',),
         dict(help='run only a single algorithm instance at a time',
              action='store_true')),
        (('--batch',),
         dict(help='Provide Queryset as Batch',
              action='store_true')),
        (('--no_save_index',),
         dict(help='do not save indices',
              action='store_true')),
    )
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)

    args = parser.parse_args()
    # -1 on the command line means "wait forever", which poll() spells None.
    if args.timeout == -1:
        args.timeout = None

    definitions = get_definitions(args.definitions)

    # The attribute only exists when --list-algorithms was given, because its
    # default is argparse.SUPPRESS.
    if hasattr(args, "list_algorithms"):
        print('The following algorithms are supported...')
        for point in definitions:
            print('\t... for the point type "%s"...' % point)
            by_metric = definitions[point]
            for metric in by_metric:
                print('\t\t... and the distance metric "%s":' % metric)
                for algorithm in by_metric[metric]:
                    print('\t\t\t%s' % algorithm)
        sys.exit(0)

    # Set resource limits to prevent memory bombs
    memory_limit = 12 * 2**30
    soft, hard = resource.getrlimit(resource.RLIMIT_DATA)
    if soft == resource.RLIM_INFINITY or soft >= memory_limit:
        print('resetting memory limit from', soft, 'to', memory_limit)
        resource.setrlimit(resource.RLIMIT_DATA, (memory_limit, hard))

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    loaded = get_dataset(args.dataset)
    X_train = loaded['train']
    X_test = loaded['test']
    distance = loaded.attrs['distance']
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    # Unless --force is given, skip (library, name) pairs with stored results.
    if args.force:
        finished = set()
    else:
        finished = {
            (entry.attrs["library"], entry.attrs["name"])
            for entry in get_results(args.dataset, args.count, distance)
        }

    point_type = 'float'  # TODO(erikbern): should look at the type of X_train
    algos = get_algorithms(definitions, constructors, len(X_train[0]),
                           point_type, distance, args.count)

    if args.algorithm:
        print('running only', args.algorithm)
        algos = {args.algorithm: algos[args.algorithm]}
        if args.sub_algorithm:
            algos[args.algorithm] = [
                candidate for candidate in algos[args.algorithm]
                if candidate.name == args.sub_algorithm
            ]

    pending = [
        (library, algo)
        for library in algos.keys()
        for algo in algos[library]
        if (library, algo.name) not in finished
    ]
    random.shuffle(pending)

    print('order:', [algo.name for _, algo in pending])

    for library, algo in pending:
        receiver, sender = multiprocessing.Pipe(duplex=False)
        print(algo.name, '...')
        # Spawn a subprocess to force the memory to be reclaimed at the end
        worker = multiprocessing.Process(
            target=run_algo,
            args=(args.count, X_train, X_test, library, algo, distance,
                  sender, args.runs, args.single, args.batch))
        worker.start()
        # The parent only reads; drop its copy of the write end.
        sender.close()

        results = None
        timed_out = False
        try:
            if receiver.poll(args.timeout):
                # If there's something waiting in the pipe at this point,
                # then the worker has begun sending us results and we should
                # receive them
                attrs, results = receiver.recv()
                if "expect_extra" in attrs:
                    if attrs["expect_extra"]:
                        attrs["extra"] = receiver.recv()
                    del attrs["expect_extra"]
            else:
                # If we've exceeded the timeout and there are no results,
                # then terminate the worker process (XXX: what should we do
                # about algo.done() here?)
                worker.terminate()
                timed_out = True
        except EOFError:
            # The worker has crashed or otherwise failed to send us results
            results = None

        worker.join()
        receiver.close()

        if results:
            store_results(attrs, results, args.dataset, args.count, distance)
        elif timed_out:
            print('algorithm worker process took too long')
        else:
            print('algorithm worker process stopped unexpectedly')
def run(definition,
        dataset,
        count,
        run_count=3,
        force_single=False,
        use_batch_query=False):
    """Fit an algorithm on a dataset, time its queries and store the outcome.

    The index is built once on the ``train`` split.  The complete query set
    is then executed ``run_count`` times; the lowest mean per-query time over
    those repetitions is recorded as ``best_search_time``, while the results
    passed to ``store_results`` are those of the final repetition.
    ``algo.done()`` is always called once the dataset has been loaded, even
    if fitting or querying raises.

    Args:
        definition: algorithm definition used to instantiate the algorithm;
            its ``algorithm`` attribute is stored with the results.
        dataset: dataset name given to ``get_dataset`` and stored with the
            results.
        count: number of neighbours requested per query.
        run_count: how many times the whole query set is executed.
        force_single: when true, never fan queries out to a thread pool.
        use_batch_query: when true, hand all queries to
            ``algo.batch_query`` at once instead of querying one by one.
    """
    algo = instantiate_algorithm(definition)

    D = get_dataset(dataset)
    X_train = numpy.array(D['train'])
    X_test = numpy.array(D['test'])
    distance = D.attrs['distance']
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    try:
        t0 = time.time()
        index_size_before = algo.get_index_size("self")
        algo.fit(X_train)
        build_time = time.time() - t0
        index_size = algo.get_index_size("self") - index_size_before
        print('Built index in', build_time)
        print('Index size: ', index_size)

        distance_fn = metrics[distance]['distance']

        # A one-element list so the nested function can update the counter
        # without relying on ``nonlocal``.
        n_items_processed = [0]

        def single_query(v):
            """Time one query; return (seconds, [(index, distance), ...])."""
            start = time.time()
            candidates = algo.query(v, count)
            total = (time.time() - start)
            # Distances are computed outside the timed section.
            candidates = [(int(idx), float(distance_fn(v, X_train[idx])))
                          for idx in candidates]
            n_items_processed[0] += 1
            if n_items_processed[0] % 1000 == 0:
                print('Processed %d/%d queries...' %
                      (n_items_processed[0], X_test.shape[0]))
            if len(candidates) > count:
                print(
                    'warning: algorithm %s returned %d results, but count is only %d)'
                    % (algo.name, len(candidates), count))
            return (total, candidates)

        def batch_query(X):
            """Time one batched query; return one (seconds, hits) per row."""
            start = time.time()
            batch_results = algo.batch_query(X, count)
            total = (time.time() - start)
            # The original code iterated over an unrelated ``results`` name
            # here instead of the value returned by ``algo.batch_query``.
            candidates = [[(int(idx), float(distance_fn(v, X_train[idx])))
                           for idx in single_results]
                          for v, single_results in zip(X, batch_results)]
            # Spread the batch time evenly over the individual queries.
            per_query = total / float(len(X))
            return [(per_query, hits) for hits in candidates]

        best_search_time = float('inf')
        for i in range(run_count):
            print('Run %d/%d...' % (i + 1, run_count))
            n_items_processed[0] = 0

            if use_batch_query:
                results = batch_query(X_test)
            elif algo.use_threads() and not force_single:
                pool = multiprocessing.pool.ThreadPool()
                try:
                    results = pool.map(single_query, X_test)
                finally:
                    # Release the worker threads instead of leaking one pool
                    # per repetition.
                    pool.close()
                    pool.join()
            else:
                results = [single_query(x) for x in X_test]

            total_time = sum(elapsed for elapsed, _ in results)
            total_candidates = sum(
                len(candidates) for _, candidates in results)
            search_time = total_time / len(X_test)
            avg_candidates = total_candidates / len(X_test)
            best_search_time = min(best_search_time, search_time)

        verbose = hasattr(algo, "query_verbose")
        attrs = {
            "batch_mode": use_batch_query,
            "build_time": build_time,
            "best_search_time": best_search_time,
            "candidates": avg_candidates,
            "expect_extra": verbose,
            "index_size": index_size,
            "name": algo.name,
            "run_count": run_count,
            "run_alone": force_single,
            "distance": distance,
            "count": int(count),
            "algo": definition.algorithm,
            "dataset": dataset
        }
        store_results(dataset, count, definition, attrs, results)
    finally:
        algo.done()
def main():
    """Command-line entry point: benchmark every selected algorithm.

    Parses the command line, optionally lists the known algorithms and
    exits, caps the data-segment memory limit, removes ``INDEX_DIR`` if it
    exists, loads (or splits off) the query set, computes or reloads the
    cached query distances and finally runs each algorithm instance that
    still needs running in its own subprocess, storing whatever results the
    worker sends back through a pipe.

    Raises:
        AssertionError: if ``--query-dataset`` is given and its manifest's
            ``dataset`` entry differs from the training dataset's.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset',
        metavar='NAME',
        help='the dataset to load training points from',
        default='glove')
    parser.add_argument(
        '--query-dataset',
        metavar='NAME',
        help='load query points from another dataset instead of choosing them randomly from the training dataset',
        default=None)
    parser.add_argument(
        "-k", "--count",
        default=10,
        type=positive_int,
        help="the number of near neighbours to search for")
    parser.add_argument(
        '--distance',
        help='the metric used to calculate the distance between points',
        default='angular')
    parser.add_argument(
        '--limit',
        help='the maximum number of points to load from the dataset, or -1 to load all of them',
        type=int,
        default=-1)
    parser.add_argument(
        '--definitions',
        metavar='FILE',
        help='load algorithm definitions from FILE',
        default='algos.yaml')
    parser.add_argument(
        '--algorithm',
        metavar='NAME',
        help='run only the named algorithm',
        default=None)
    parser.add_argument(
        '--sub-algorithm',
        metavar='NAME',
        help='run only the named instance of an algorithm (requires --algo)',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true',
        default=argparse.SUPPRESS)
    parser.add_argument(
        '--force',
        help='''re-run algorithms even if their results already exist''',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help='run each algorithm instance %(metavar)s times and use only the best result',
        default=3)
    parser.add_argument(
        '--timeout',
        type=int,
        help='Timeout (in seconds) for each individual algorithm run, or -1 if no timeout should be set',
        default=-1)
    parser.add_argument(
        '--single',
        help='run only a single algorithm instance at a time',
        action='store_true')
    parser.add_argument(
        '--no_save_index',
        help='do not save indices',
        action='store_true')

    args = parser.parse_args()
    # -1 on the command line means "wait forever", which poll() spells None.
    if args.timeout == -1:
        args.timeout = None

    definitions = get_definitions(args.definitions)

    # The attribute only exists when --list-algorithms was given, because its
    # default is argparse.SUPPRESS.
    if hasattr(args, "list_algorithms"):
        # print() calls (not print statements) so the module also parses on
        # Python 3 and matches the other print() calls in this function.
        print("The following algorithms are supported...")
        for point in definitions:
            print("\t... for the point type '%s'..." % point)
            for metric in definitions[point]:
                print("\t\t... and the distance metric '%s':" % metric)
                for algorithm in definitions[point][metric]:
                    print("\t\t\t%s" % algorithm)
        sys.exit(0)

    # Set resource limits to prevent memory bombs
    memory_limit = 12 * 2**30
    soft, hard = resource.getrlimit(resource.RLIMIT_DATA)
    if soft == resource.RLIM_INFINITY or soft >= memory_limit:
        print('resetting memory limit from', soft, 'to', memory_limit)
        resource.setrlimit(resource.RLIMIT_DATA, (memory_limit, hard))

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    manifest, X = get_dataset(args.dataset, args.limit)
    if not args.query_dataset:
        X_train, X_test = split_dataset(
            X, test_size=manifest['dataset']['test_size'])
    else:
        X_train = X
        query_manifest, X_test = get_dataset(args.query_dataset)
        assert manifest["dataset"] == query_manifest["dataset"], """\
error: the training dataset and query dataset have incompatible manifests"""

    queries_fn = get_query_cache_path(args.dataset, args.count, args.limit,
                                      args.distance, args.query_dataset)
    print('storing queries in', queries_fn)
    if not os.path.exists(queries_fn):
        queries = compute_distances(args.distance, args.count, X_train,
                                    X_test)
        # Pickle data is binary; text mode fails on Python 3.
        with open(queries_fn, 'wb') as f:
            pickle.dump(queries, f)
    else:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only safe as long as this cache is written by this program.
        with open(queries_fn, 'rb') as f:
            queries = pickle.load(f)

    print('got', len(queries), 'queries')

    # Unless --force is given, skip (library, name) pairs with stored results.
    algos_already_run = set()
    if not args.force:
        for run in get_results(args.dataset, args.limit, args.count,
                               args.distance, args.query_dataset):
            algos_already_run.add((run["library"], run["name"]))

    point_type = manifest['dataset']['point_type']
    algos = get_algorithms(definitions, constructors, len(X_train[0]),
                           point_type, args.distance, args.count)

    if args.algorithm:
        print('running only', args.algorithm)
        algos = {args.algorithm: algos[args.algorithm]}
        if args.sub_algorithm:
            algos[args.algorithm] = \
                [algo for algo in algos[args.algorithm]
                 if algo.name == args.sub_algorithm]

    algos_flat = []

    for library in algos.keys():
        for algo in algos[library]:
            if (library, algo.name) not in algos_already_run:
                algos_flat.append((library, algo))

    random.shuffle(algos_flat)

    print('order:', [a.name for l, a in algos_flat])

    for library, algo in algos_flat:
        recv_pipe, send_pipe = multiprocessing.Pipe(duplex=False)
        print(algo.name, '...')
        # Spawn a subprocess to force the memory to be reclaimed at the end
        p = multiprocessing.Process(
            target=run_algo,
            args=(args.count, X_train, queries, library, algo,
                  args.distance, send_pipe, args.runs, args.single))

        p.start()
        # The parent only reads; drop its copy of the write end.
        send_pipe.close()

        timed_out = False
        try:
            results = recv_pipe.poll(args.timeout)
            if results:
                # If there's something waiting in the pipe at this point, then
                # the worker has begun sending us results and we should receive
                # them
                results = recv_pipe.recv()
                if "expect_extra" in results:
                    if results["expect_extra"]:
                        results["extra"] = recv_pipe.recv()
                    del results["expect_extra"]
            else:
                # If we've exceeded the timeout and there are no results, then
                # terminate the worker process (XXX: what should we do about
                # algo.done() here?)
                p.terminate()
                timed_out = True
                results = None
        except EOFError:
            # The worker has crashed or otherwise failed to send us results
            results = None
        p.join()
        recv_pipe.close()

        if results:
            store_results(results, args.dataset, args.limit, args.count,
                          args.distance, args.query_dataset)
        elif timed_out:
            print("(algorithm worker process took too long)")
        else:
            print("(algorithm worker process stopped unexpectedly)")
def run(definition, dataset, count, run_count=3, force_single=False,
        use_batch_query=False):
    """Fit an algorithm on a dataset, time its queries and store the outcome.

    The index is built once on the ``train`` split.  The complete query set
    is then executed ``run_count`` times; the lowest mean per-query time over
    those repetitions is recorded as ``best_search_time``, while the results
    passed to ``store_results`` are those of the final repetition.
    ``algo.done()`` is always called once the dataset has been loaded, even
    if fitting or querying raises.

    Args:
        definition: algorithm definition used to instantiate the algorithm
            and forwarded to ``store_results``.
        dataset: dataset name given to ``get_dataset`` and
            ``store_results``.
        count: number of neighbours requested per query.
        run_count: how many times the whole query set is executed.
        force_single: when true, never fan queries out to a thread pool.
        use_batch_query: when true, hand all queries to
            ``algo.batch_query`` at once instead of querying one by one.
    """
    algo = instantiate_algorithm(definition)

    D = get_dataset(dataset)
    X_train = numpy.array(D['train'])
    X_test = numpy.array(D['test'])
    distance = D.attrs['distance']
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    try:
        t0 = time.time()
        index_size_before = algo.get_index_size("self")
        algo.fit(X_train)
        build_time = time.time() - t0
        index_size = algo.get_index_size("self") - index_size_before
        print('Built index in', build_time)
        print('Index size: ', index_size)

        distance_fn = metrics[distance]['distance']

        # A one-element list so the nested function can update the counter
        # without relying on ``nonlocal``.
        n_items_processed = [0]

        def single_query(v):
            """Time one query; return (seconds, [(index, distance), ...])."""
            start = time.time()
            candidates = algo.query(v, count)
            total = (time.time() - start)
            # Distances are computed outside the timed section.
            candidates = [(int(idx), float(distance_fn(v, X_train[idx])))
                          for idx in candidates]
            n_items_processed[0] += 1
            if n_items_processed[0] % 1000 == 0:
                print('Processed %d/%d queries...' %
                      (n_items_processed[0], X_test.shape[0]))
            if len(candidates) > count:
                print('warning: algorithm %s returned %d results, but count is only %d)'
                      % (algo.name, len(candidates), count))
            return (total, candidates)

        def batch_query(X):
            """Time one batched query; return one (seconds, hits) per row."""
            start = time.time()
            batch_results = algo.batch_query(X, count)
            total = (time.time() - start)
            # The original code iterated over an unrelated ``results`` name
            # here instead of the value returned by ``algo.batch_query``.
            candidates = [[(int(idx), float(distance_fn(v, X_train[idx])))
                           for idx in single_results]
                          for v, single_results in zip(X, batch_results)]
            # Spread the batch time evenly over the individual queries.
            per_query = total / float(len(X))
            return [(per_query, hits) for hits in candidates]

        best_search_time = float('inf')
        for i in range(run_count):
            print('Run %d/%d...' % (i + 1, run_count))
            n_items_processed[0] = 0

            if use_batch_query:
                results = batch_query(X_test)
            elif algo.use_threads() and not force_single:
                pool = multiprocessing.pool.ThreadPool()
                try:
                    results = pool.map(single_query, X_test)
                finally:
                    # Release the worker threads instead of leaking one pool
                    # per repetition.
                    pool.close()
                    pool.join()
            else:
                results = [single_query(x) for x in X_test]

            total_time = sum(elapsed for elapsed, _ in results)
            total_candidates = sum(
                len(candidates) for _, candidates in results)
            search_time = total_time / len(X_test)
            avg_candidates = total_candidates / len(X_test)
            best_search_time = min(best_search_time, search_time)

        verbose = hasattr(algo, "query_verbose")
        attrs = {
            "batch_mode": use_batch_query,
            "build_time": build_time,
            "best_search_time": best_search_time,
            "candidates": avg_candidates,
            "expect_extra": verbose,
            "index_size": index_size,
            "name": algo.name,
            "run_count": run_count,
            "run_alone": force_single,
        }
        store_results(dataset, count, definition, attrs, results)
    finally:
        algo.done()
def main():
    """Command-line entry point: benchmark every selected algorithm.

    Parses the command line, optionally lists the known algorithms and
    exits, caps the data-segment memory limit, removes ``INDEX_DIR`` if it
    exists, loads (or splits off) the query set, computes or reloads the
    cached query distances and finally runs each algorithm instance that
    still needs running in its own subprocess, storing whatever results the
    worker sends back through a pipe.

    Raises:
        AssertionError: if ``--query-dataset`` is given and its manifest's
            ``dataset`` entry differs from the training dataset's.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset',
        metavar='NAME',
        help='the dataset to load training points from',
        default='glove')
    parser.add_argument(
        '--query-dataset',
        metavar='NAME',
        help='load query points from another dataset instead of choosing them randomly from the training dataset',
        default=None)
    parser.add_argument(
        "-k", "--count",
        default=10,
        type=positive_int,
        help="the number of near neighbours to search for")
    parser.add_argument(
        '--distance',
        help='the metric used to calculate the distance between points',
        default='angular')
    parser.add_argument(
        '--limit',
        help='the maximum number of points to load from the dataset, or -1 to load all of them',
        type=int,
        default=-1)
    parser.add_argument(
        '--definitions',
        metavar='FILE',
        help='load algorithm definitions from FILE',
        default='algos.yaml')
    parser.add_argument(
        '--algorithm',
        metavar='NAME',
        help='run only the named algorithm',
        default=None)
    parser.add_argument(
        '--sub-algorithm',
        metavar='NAME',
        help='run only the named instance of an algorithm (requires --algo)',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true',
        default=argparse.SUPPRESS)
    parser.add_argument(
        '--force',
        help='''re-run algorithms even if their results already exist''',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help='run each algorithm instance %(metavar)s times and use only the best result',
        default=3)
    parser.add_argument(
        '--timeout',
        type=int,
        help='Timeout (in seconds) for each individual algorithm run, or -1 if no timeout should be set',
        default=-1)
    parser.add_argument(
        '--single',
        help='run only a single algorithm instance at a time',
        action='store_true')
    parser.add_argument(
        '--no_save_index',
        help='do not save indices',
        action='store_true')

    args = parser.parse_args()
    # -1 on the command line means "wait forever", which poll() spells None.
    if args.timeout == -1:
        args.timeout = None

    definitions = get_definitions(args.definitions)

    # The attribute only exists when --list-algorithms was given, because its
    # default is argparse.SUPPRESS.
    if hasattr(args, "list_algorithms"):
        # print() calls (not print statements) so the module also parses on
        # Python 3 and matches the other print() calls in this function.
        print("The following algorithms are supported...")
        for point in definitions:
            print("\t... for the point type '%s'..." % point)
            for metric in definitions[point]:
                print("\t\t... and the distance metric '%s':" % metric)
                for algorithm in definitions[point][metric]:
                    print("\t\t\t%s" % algorithm)
        sys.exit(0)

    # Set resource limits to prevent memory bombs
    memory_limit = 12 * 2**30
    soft, hard = resource.getrlimit(resource.RLIMIT_DATA)
    if soft == resource.RLIM_INFINITY or soft >= memory_limit:
        print('resetting memory limit from', soft, 'to', memory_limit)
        resource.setrlimit(resource.RLIMIT_DATA, (memory_limit, hard))

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    manifest, X = get_dataset(args.dataset, args.limit)
    if not args.query_dataset:
        X_train, X_test = split_dataset(
            X, test_size=manifest['dataset']['test_size'])
    else:
        X_train = X
        query_manifest, X_test = get_dataset(args.query_dataset)
        assert manifest["dataset"] == query_manifest["dataset"], """\
error: the training dataset and query dataset have incompatible manifests"""

    queries_fn = get_query_cache_path(
        args.dataset, args.count, args.limit, args.distance,
        args.query_dataset)
    print('storing queries in', queries_fn)
    if not os.path.exists(queries_fn):
        queries = compute_distances(
            args.distance, args.count, X_train, X_test)
        # Pickle data is binary; text mode fails on Python 3.
        with open(queries_fn, 'wb') as f:
            pickle.dump(queries, f)
    else:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only safe as long as this cache is written by this program.
        with open(queries_fn, 'rb') as f:
            queries = pickle.load(f)

    print('got', len(queries), 'queries')

    # Unless --force is given, skip (library, name) pairs with stored results.
    algos_already_run = set()
    if not args.force:
        for run in get_results(args.dataset, args.limit, args.count,
                               args.distance, args.query_dataset):
            algos_already_run.add((run["library"], run["name"]))

    point_type = manifest['dataset']['point_type']
    algos = get_algorithms(
        definitions, constructors, len(X_train[0]), point_type,
        args.distance, args.count)

    if args.algorithm:
        print('running only', args.algorithm)
        algos = {args.algorithm: algos[args.algorithm]}
        if args.sub_algorithm:
            algos[args.algorithm] = \
                [algo for algo in algos[args.algorithm]
                 if algo.name == args.sub_algorithm]

    algos_flat = []

    for library in algos.keys():
        for algo in algos[library]:
            if (library, algo.name) not in algos_already_run:
                algos_flat.append((library, algo))

    random.shuffle(algos_flat)

    print('order:', [a.name for l, a in algos_flat])

    for library, algo in algos_flat:
        recv_pipe, send_pipe = multiprocessing.Pipe(duplex=False)
        print(algo.name, '...')
        # Spawn a subprocess to force the memory to be reclaimed at the end
        p = multiprocessing.Process(
            target=run_algo,
            args=(args.count, X_train, queries, library, algo,
                  args.distance, send_pipe, args.runs, args.single))

        p.start()
        # The parent only reads; drop its copy of the write end.
        send_pipe.close()

        timed_out = False
        try:
            results = recv_pipe.poll(args.timeout)
            if results:
                # If there's something waiting in the pipe at this point, then
                # the worker has begun sending us results and we should receive
                # them
                results = recv_pipe.recv()
                if "expect_extra" in results:
                    if results["expect_extra"]:
                        results["extra"] = recv_pipe.recv()
                    del results["expect_extra"]
            else:
                # If we've exceeded the timeout and there are no results, then
                # terminate the worker process (XXX: what should we do about
                # algo.done() here?)
                p.terminate()
                timed_out = True
                results = None
        except EOFError:
            # The worker has crashed or otherwise failed to send us results
            results = None
        p.join()
        recv_pipe.close()

        if results:
            store_results(results, args.dataset, args.limit, args.count,
                          args.distance, args.query_dataset)
        elif timed_out:
            print("(algorithm worker process took too long)")
        else:
            print("(algorithm worker process stopped unexpectedly)")