Example #1
def load_all_results():
    """Read all result files and compute all metrics"""
    all_runs_by_dataset = {'batch': {}, 'non-batch': {}}
    all_runs_by_algorithm = {'batch': {}, 'non-batch': {}}
    cached_true_dist = []
    old_sdn = None
    for properties, f in results.load_all_results():
        sdn = get_run_desc(properties)
        if sdn != old_sdn:
            dataset = get_dataset(properties["dataset"])
            cached_true_dist = list(dataset["distances"])
            old_sdn = sdn
        algo = properties["algo"]
        ms = compute_all_metrics(cached_true_dist, f, properties,
                                 args.recompute)
        algo_ds = get_dataset_label(sdn)
        idx = "non-batch"
        if properties["batch_mode"]:
            idx = "batch"
        all_runs_by_algorithm[idx].setdefault(
            algo, {}).setdefault(algo_ds, []).append(ms)
        all_runs_by_dataset[idx].setdefault(
            sdn, {}).setdefault(algo, []).append(ms)

    return (all_runs_by_dataset, all_runs_by_algorithm)
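The chained setdefault calls above build a two-level nested mapping (algorithm → dataset → list of metric dicts) without explicit key checks. A minimal, self-contained sketch of the pattern, with hypothetical keys and metric values:

# Sketch of the nested setdefault accumulation used above;
# the keys and metric dicts are hypothetical.
runs = {'batch': {}, 'non-batch': {}}
for ms in ({'recall': 0.90}, {'recall': 0.95}):
    runs['non-batch'].setdefault('annoy', {}).setdefault(
        'glove-100-angular', []).append(ms)
assert runs['non-batch']['annoy']['glove-100-angular'] == [
    {'recall': 0.90}, {'recall': 0.95}]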
Example #2
def load_all_results():
    """Read all result files and compute all metrics"""
    all_runs_by_dataset = {'batch': {}, 'non-batch': {}}
    all_runs_by_algorithm = {'batch': {}, 'non-batch': {}}
    cached_true_dist = []
    old_sdn = None
    for mode in ["non-batch", "batch"]:
        for properties, f in results.load_all_results(
                batch_mode=(mode == "batch")):
            sdn = get_run_desc(properties)
            if sdn != old_sdn:
                dataset = get_dataset(properties["dataset"])
                cached_true_dist = list(dataset["distances"])
                old_sdn = sdn
            algo_ds = get_dataset_label(sdn)
            desc_suffix = ("-batch" if mode == "batch" else "")
            algo = properties["algo"] + desc_suffix
            sdn += desc_suffix
            ms = compute_all_metrics(cached_true_dist, f, properties,
                                     args.recompute)
            all_runs_by_algorithm[mode].setdefault(
                algo, {}).setdefault(algo_ds, []).append(ms)
            all_runs_by_dataset[mode].setdefault(
                sdn, {}).setdefault(algo, []).append(ms)

    return (all_runs_by_dataset, all_runs_by_algorithm)
Example #3
def load_all_results():
    """Read all result files and compute all metrics"""
    all_runs_by_dataset = {}
    all_runs_by_algorithm = {}
    cached_true_dist = []
    old_sdn = None
    for f in results.load_all_results():
        properties = dict(f.attrs)
        # TODO Fix this properly. Sometimes the hdf5 file returns bytes
        # This converts these bytes to strings before we work with them
        for k in properties.keys():
            try:
                properties[k] = properties[k].decode()
            except (AttributeError, UnicodeDecodeError):
                # Value was already a str (or is not valid UTF-8); keep as-is
                pass
        sdn = get_run_desc(properties)
        if sdn != old_sdn:
            dataset = get_dataset(properties["dataset"])
            cached_true_dist = list(dataset["distances"])
            old_sdn = sdn
        algo = properties["algo"]
        ms = compute_all_metrics(cached_true_dist, f, properties["algo"])
        algo_ds = get_dataset_label(sdn)

        all_runs_by_algorithm.setdefault(algo, {}).setdefault(algo_ds, []).append(ms)
        all_runs_by_dataset.setdefault(sdn, {}).setdefault(algo, []).append(ms)
    return (all_runs_by_dataset, all_runs_by_algorithm)
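The try/except loop above exists because h5py can hand back attribute values as bytes. A tighter helper that only decodes bytes values (the function name is hypothetical, not part of the original code):

def decode_attrs(attrs):
    """Return a copy of an h5py attrs mapping with bytes values decoded
    to str and all other values left untouched."""
    return {k: (v.decode() if isinstance(v, bytes) else v)
            for k, v in dict(attrs).items()}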
Example #4
def load_all_results():
    """Read all result files and compute all metrics"""
    all_runs_by_dataset = {'batch': {}, 'non-batch': {}}
    all_runs_by_algorithm = {'batch': {}, 'non-batch': {}}
    cached_true_dist = []
    old_sdn = None
    for properties, f in results.load_all_results():
        sdn = get_run_desc(properties)
        if sdn != old_sdn:
            dataset = get_dataset(properties["dataset"])
            cached_true_dist = list(dataset["distances"])
            old_sdn = sdn
        algo = properties["algo"]
        ms = compute_all_metrics(
            cached_true_dist, f, properties, args.recompute)
        algo_ds = get_dataset_label(sdn)
        idx = "non-batch"
        if properties["batch_mode"]:
            idx = "batch"
        all_runs_by_algorithm[idx].setdefault(
            algo, {}).setdefault(algo_ds, []).append(ms)
        all_runs_by_dataset[idx].setdefault(
            sdn, {}).setdefault(algo, []).append(ms)

    return (all_runs_by_dataset, all_runs_by_algorithm)
Example #5
def run(definition, dataset, count, run_count, batch):
    algo = instantiate_algorithm(definition)
    assert not definition.query_argument_groups \
        or hasattr(algo, "set_query_arguments"), """\
error: query argument groups have been specified for %s.%s(%s), but the \
algorithm instantiated from it does not implement the set_query_arguments \
function""" % (definition.module, definition.constructor, definition.arguments)

    D = get_dataset(dataset)
    X_train = numpy.array(D['train'])
    X_test = numpy.array(D['test'])
    distance = D.attrs['distance']
    print("type D: ", type(D))
    print("type x_train: ", type(X_train))
    print("type x_test: ", type(X_test))
    print("type distance: ", type(distance))
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    X_train = dataset_transform[distance](X_train)
    X_test = dataset_transform[distance](X_test)

    try:
        prepared_queries = False
        if hasattr(algo, "supports_prepared_queries"):
            prepared_queries = algo.supports_prepared_queries()

        t0 = time.time()
        memory_usage_before = algo.get_memory_usage()
        algo.fit(X_train)
        build_time = time.time() - t0
        index_size = algo.get_memory_usage() - memory_usage_before
        print('Built index in', build_time)
        print('Index size: ', index_size)

        query_argument_groups = definition.query_argument_groups
        # Make sure that algorithms with no query argument groups still get run
        # once by providing them with a single, empty, harmless group
        if not query_argument_groups:
            query_argument_groups = [[]]

        for pos, query_arguments in enumerate(query_argument_groups, 1):
            print("Running query argument group %d of %d..." %
                  (pos, len(query_argument_groups)))
            if query_arguments:
                algo.set_query_arguments(*query_arguments)
            descriptor, results = run_individual_query(algo, X_train, X_test,
                                                       distance, count,
                                                       run_count, batch)
            descriptor["build_time"] = build_time
            descriptor["index_size"] = index_size
            descriptor["algo"] = get_algorithm_name(definition.algorithm,
                                                    batch)
            descriptor["dataset"] = dataset
            store_results(dataset, count, definition, query_arguments,
                          descriptor, results, batch)
    finally:
        algo.done()
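run() only assumes a small duck-typed interface on the algorithm object: fit(), get_memory_usage(), query() (used inside run_individual_query), optional set_query_arguments()/supports_prepared_queries(), and done(). A minimal brute-force sketch of that contract; the class and its memory accounting are illustrative, not the actual ann-benchmarks BaseANN:

import numpy

class BruteForceSketch:
    name = 'bruteforce-sketch'

    def fit(self, X):
        # "Index build" is just keeping the raw vectors around
        self._data = numpy.asarray(X)

    def get_memory_usage(self):
        # Illustrative only: real wrappers typically report process RSS
        return self._data.nbytes if hasattr(self, '_data') else 0

    def set_query_arguments(self, *args):
        pass  # no tunable search-time parameters in this sketch

    def query(self, v, n):
        # Exact k-NN by Euclidean distance
        dists = numpy.linalg.norm(self._data - v, axis=1)
        return numpy.argsort(dists)[:n]

    def done(self):
        pass  # nothing to release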
Example #6
def run(definition, dataset, count, run_count, batch):
    algo = instantiate_algorithm(definition)
    assert not definition.query_argument_groups \
        or hasattr(algo, "set_query_arguments"), """\
error: query argument groups have been specified for %s.%s(%s), but the \
algorithm instantiated from it does not implement the set_query_arguments \
function""" % (definition.module, definition.constructor, definition.arguments)

    D = get_dataset(dataset)
    X_train = numpy.array(D['train'])
    X_test = numpy.array(D['test'])
    distance = D.attrs['distance']
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    try:
        prepared_queries = False
        if hasattr(algo, "supports_prepared_queries"):
            prepared_queries = algo.supports_prepared_queries()

        t0 = time.time()
        memory_usage_before = algo.get_memory_usage()
        algo.fit(X_train)
        build_time = time.time() - t0
        index_size = algo.get_memory_usage() - memory_usage_before
        print('Built index in', build_time)
        print('Index size: ', index_size)

        query_argument_groups = definition.query_argument_groups
        # Make sure that algorithms with no query argument groups still get run
        # once by providing them with a single, empty, harmless group
        if not query_argument_groups:
            query_argument_groups = [[]]

        for pos, query_arguments in enumerate(query_argument_groups, 1):
            print("Running query argument group %d of %d..." %
                  (pos, len(query_argument_groups)))
            if query_arguments:
                algo.set_query_arguments(*query_arguments)
            descriptor, results = run_individual_query(
                algo, X_train, X_test, distance, count, run_count, batch)
            descriptor["build_time"] = build_time
            descriptor["index_size"] = index_size
            descriptor["algo"] = get_algorithm_name(
                definition.algorithm, batch)
            descriptor["dataset"] = dataset
            store_results(dataset, count, definition,
                          query_arguments, descriptor, results, batch)
    finally:
        algo.done()
Example #7
def run(definition, dataset, count, run_count, batch):
    algo = instantiate_algorithm(definition)
    assert not definition.query_argument_groups \
        or hasattr(algo, "set_query_arguments"), """\
error: query argument groups have been specified for %s.%s(%s), but the \
algorithm instantiated from it does not implement the set_query_arguments \
function""" % (definition.module, definition.constructor, definition.arguments)

    D = get_dataset(dataset)
    X_train = numpy.array(D['train'])
    X_test = numpy.array(D['test'])
    if algo.builds_graph():
        # Test data first to avoid converting test set index to graph index
        X_train = numpy.concatenate((X_test, X_train))
        # The protocol expects the count to be given at query time, so it has
        # to be set as a parameter beforehand.
        algo.set_count(count)
    distance = D.attrs['distance']
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    try:
        prepared_queries = False
        if hasattr(algo, "supports_prepared_queries"):
            prepared_queries = algo.supports_prepared_queries()

        t0 = time.time()
        memory_usage_before = algo.get_memory_usage()
        algo.fit(X_train)

        build_time = time.time() - t0
        index_size = algo.get_memory_usage() - memory_usage_before
        print('Built index in', build_time)
        print('Index size: ', index_size)

        query_argument_groups = definition.query_argument_groups
        # Make sure that algorithms with no query argument groups still get run
        # once by providing them with a single, empty, harmless group
        if not query_argument_groups:
            query_argument_groups = [[]]

        for pos, query_arguments in enumerate(query_argument_groups, 1):
            print("Running query argument group %d of %d..." %
                    (pos, len(query_argument_groups)))
            if query_arguments:
                algo.set_query_arguments(*query_arguments)
            if algo.builds_graph():
                descriptor, results = check_graph(algo, X_train, X_test, distance, count)
            else:
                descriptor, results = run_individual_query(algo, X_train, X_test,
                    distance, count, run_count, batch)
            descriptor["build_time"] = build_time
            descriptor["index_size"] = index_size
            descriptor["algo"] = get_algorithm_name(definition.algorithm, batch)
            descriptor["dataset"] = dataset
            descriptor["count"] = int(count)
            descriptor["batch_mode"] = batch
            store_results(dataset, count, definition, query_arguments,
                          descriptor, results, batch)
    finally:
        algo.done()
Example #8
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--dataset',
                        metavar='NAME',
                        help='the dataset to load training points from',
                        default='glove-100-angular',
                        choices=DATASETS.keys())
    parser.add_argument("-k",
                        "--count",
                        default=10,
                        type=positive_int,
                        help="the number of near neighbours to search for")
    parser.add_argument('--definitions',
                        metavar='FILE',
                        help='load algorithm definitions from FILE',
                        default='algos.yaml')
    parser.add_argument('--algorithm',
                        metavar='NAME',
                        help='run only the named algorithm',
                        default=None)
    parser.add_argument(
        '--docker-tag',
        metavar='NAME',
        help='run only algorithms in a particular docker image',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true')
    parser.add_argument(
        '--force',
        help='re-run algorithms even if their results already exist',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help='run each algorithm instance %(metavar)s times and use only'
        ' the best result',
        default=5)
    parser.add_argument(
        '--timeout',
        type=int,
        help='Timeout (in seconds) for each individual algorithm run, or -1'
        ' if no timeout should be set',
        default=2 * 3600)
    parser.add_argument(
        '--local',
        action='store_true',
        help='If set, then will run everything locally (inside the same '
        'process) rather than using Docker')
    parser.add_argument('--batch',
                        action='store_true',
                        help='If set, algorithms get all queries at once')
    parser.add_argument(
        '--max-n-algorithms',
        type=int,
        help='Max number of algorithms to run (just used for testing)',
        default=-1)
    parser.add_argument('--run-disabled',
                        help='run algorithms that are disabled in algos.yml',
                        action='store_true')
    parser.add_argument('--parallelism',
                        type=positive_int,
                        help='Number of Docker containers in parallel',
                        default=1)

    args = parser.parse_args()
    if args.timeout == -1:
        args.timeout = None

    if args.list_algorithms:
        list_algorithms(args.definitions)
        sys.exit(0)

    logging.config.fileConfig("logging.conf")
    logger = logging.getLogger("annb")

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    dataset, dimension = get_dataset(args.dataset)
    point_type = dataset.attrs.get('point_type', 'float')
    distance = dataset.attrs['distance']
    definitions = get_definitions(args.definitions, dimension, point_type,
                                  distance, args.count)

    # Filter out, from the loaded definitions, all those query argument groups
    # that correspond to experiments that have already been run. (This might
    # mean removing a definition altogether, so we can't just use a list
    # comprehension.)
    filtered_definitions = []
    for definition in definitions:
        query_argument_groups = definition.query_argument_groups
        if not query_argument_groups:
            query_argument_groups = [[]]
        not_yet_run = []
        for query_arguments in query_argument_groups:
            fn = get_result_filename(args.dataset, args.count, definition,
                                     query_arguments, args.batch)
            if args.force or not os.path.exists(fn):
                not_yet_run.append(query_arguments)
        if not_yet_run:
            if definition.query_argument_groups:
                definition = definition._replace(
                    query_argument_groups=not_yet_run)
            filtered_definitions.append(definition)
    definitions = filtered_definitions

    random.shuffle(definitions)

    if args.algorithm:
        logger.info(f'running only {args.algorithm}')
        definitions = [d for d in definitions if d.algorithm == args.algorithm]

    if not args.local:
        # See which Docker images we have available
        docker_client = docker.from_env()
        docker_tags = set()
        for image in docker_client.images.list():
            for tag in image.tags:
                tag = tag.split(':')[0]
                docker_tags.add(tag)

        if args.docker_tag:
            logger.info(f'running only {args.docker_tag}')
            definitions = [
                d for d in definitions if d.docker_tag == args.docker_tag
            ]

        if set(d.docker_tag for d in definitions).difference(docker_tags):
            logger.info(
                f'not all docker images available, only: {set(docker_tags)}')
            logger.info(
                f'missing docker images: '
                f'{str(set(d.docker_tag for d in definitions).difference(docker_tags))}'
            )
            definitions = [
                d for d in definitions if d.docker_tag in docker_tags
            ]
    else:

        def _test(df):
            status = algorithm_status(df)
            # If the module was loaded but doesn't actually have a constructor
            # of the right name, then the definition is broken
            if status == InstantiationStatus.NO_CONSTRUCTOR:
                raise Exception(
                    "%s.%s(%s): error: the module '%s' does not"
                    " expose the named constructor" %
                    (df.module, df.constructor, df.arguments, df.module))

            if status == InstantiationStatus.NO_MODULE:
                # If the module couldn't be loaded (presumably because
                # of a missing dependency), print a warning and remove
                # this definition from the list of things to be run
                logging.warning(
                    "%s.%s(%s): the module '%s' could not be "
                    "loaded; skipping" %
                    (df.module, df.constructor, df.arguments, df.module))
                return False
            else:
                return True

        definitions = [d for d in definitions if _test(d)]

    if not args.run_disabled:
        disabled = [d for d in definitions if d.disabled]
        if disabled:
            logger.info(f'Not running disabled algorithms {disabled}')
        definitions = [d for d in definitions if not d.disabled]

    if args.max_n_algorithms >= 0:
        definitions = definitions[:args.max_n_algorithms]

    if len(definitions) == 0:
        raise Exception('Nothing to run')
    else:
        logger.info(f'Order: {definitions}')

    if args.parallelism > multiprocessing.cpu_count() - 1:
        raise Exception('Parallelism larger than %d! (CPU count minus one)' %
                        (multiprocessing.cpu_count() - 1))

    # Multiprocessing magic to farm this out to all CPUs
    queue = multiprocessing.Queue()
    for definition in definitions:
        queue.put(definition)
    if args.batch and args.parallelism > 1:
        raise Exception(
            f"Batch mode uses all available CPU resources, --parallelism should be set to 1. (Was: {args.parallelism})"
        )
    workers = [
        multiprocessing.Process(target=run_worker, args=(i + 1, args, queue))
        for i in range(args.parallelism)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
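run_worker is not shown in this snippet; a plausible sketch of the loop each worker process runs, draining the shared queue until it is empty. The run()/run_docker() calls mirror how they are invoked in the other examples on this page, but the exact signatures here are an assumption:

import queue as queue_module

def run_worker_sketch(worker_id, args, q):
    # Hypothetical worker loop: pull definitions until the queue drains
    while True:
        try:
            definition = q.get(timeout=1)
        except queue_module.Empty:
            break
        if args.local:
            run(definition, args.dataset, args.count, args.runs)
        else:
            run_docker(definition, args.dataset, args.count, args.runs)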
Example #9
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--dataset',
                        metavar='NAME',
                        help='the dataset to load training points from',
                        default='glove-100-angular')
    parser.add_argument("-k",
                        "--count",
                        default=10,
                        type=positive_int,
                        help="the number of near neighbours to search for")
    parser.add_argument('--definitions',
                        metavar='FILE',
                        help='load algorithm definitions from FILE',
                        default='algos.yaml')
    parser.add_argument('--algorithm',
                        metavar='NAME',
                        help='run only the named algorithm',
                        default=None)
    parser.add_argument(
        '--sub-algorithm',
        metavar='NAME',
        help='run only the named instance of an algorithm (requires --algo)',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true',
        default=argparse.SUPPRESS)
    parser.add_argument(
        '--force',
        help='''re-run algorithms even if their results already exist''',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help=
        'run each algorithm instance %(metavar)s times and use only the best result',
        default=3)
    parser.add_argument(
        '--timeout',
        type=int,
        help=
        'Timeout (in seconds) for each individual algorithm run, or -1 if no timeout should be set',
        default=-1)
    parser.add_argument('--single',
                        help='run only a single algorithm instance at a time',
                        action='store_true')
    parser.add_argument('--batch',
                        help='Provide Queryset as Batch',
                        action='store_true')
    parser.add_argument('--no_save_index',
                        help='do not save indices',
                        action='store_true')

    args = parser.parse_args()
    if args.timeout == -1:
        args.timeout = None

    definitions = get_definitions(args.definitions)
    if hasattr(args, "list_algorithms"):
        print('The following algorithms are supported...')
        for point in definitions:
            print('\t... for the point type "%s"...' % point)
            for metric in definitions[point]:
                print('\t\t... and the distance metric "%s":' % metric)
                for algorithm in definitions[point][metric]:
                    print('\t\t\t%s' % algorithm)
        sys.exit(0)

    # Set resource limits to prevent memory bombs
    memory_limit = 12 * 2**30
    soft, hard = resource.getrlimit(resource.RLIMIT_DATA)
    if soft == resource.RLIM_INFINITY or soft >= memory_limit:
        print('resetting memory limit from', soft, 'to', memory_limit)
        resource.setrlimit(resource.RLIMIT_DATA, (memory_limit, hard))

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    dataset = get_dataset(args.dataset)
    X_train = dataset['train']
    X_test = dataset['test']
    distance = dataset.attrs['distance']
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    algos_already_run = set()
    if not args.force:
        for run in get_results(args.dataset, args.count, distance):
            algos_already_run.add((run.attrs["library"], run.attrs["name"]))

    point_type = 'float'  # TODO(erikbern): should look at the type of X_train
    algos = get_algorithms(definitions, constructors, len(X_train[0]),
                           point_type, distance, args.count)

    if args.algorithm:
        print('running only', args.algorithm)
        algos = {args.algorithm: algos[args.algorithm]}
        if args.sub_algorithm:
            algos[args.algorithm] = [
                algo for algo in algos[args.algorithm]
                if algo.name == args.sub_algorithm
            ]

    algos_flat = []

    for library in algos.keys():
        for algo in algos[library]:
            if (library, algo.name) not in algos_already_run:
                algos_flat.append((library, algo))

    random.shuffle(algos_flat)

    print('order:', [a.name for l, a in algos_flat])

    for library, algo in algos_flat:
        recv_pipe, send_pipe = multiprocessing.Pipe(duplex=False)
        print(algo.name, '...')
        # Spawn a subprocess to force the memory to be reclaimed at the end
        p = multiprocessing.Process(target=run_algo,
                                    args=(args.count, X_train, X_test, library,
                                          algo, distance, send_pipe, args.runs,
                                          args.single, args.batch))

        p.start()
        send_pipe.close()

        timed_out = False
        try:
            r = recv_pipe.poll(args.timeout)
            if r:
                # If there's something waiting in the pipe at this point, then
                # the worker has begun sending us results and we should receive
                # them
                attrs, results = recv_pipe.recv()
                if "expect_extra" in attrs:
                    if attrs["expect_extra"]:
                        attrs["extra"] = recv_pipe.recv()
                    del attrs["expect_extra"]
            else:
                # If we've exceeded the timeout and there are no results, then
                # terminate the worker process (XXX: what should we do about
                # algo.done() here?)
                p.terminate()
                timed_out = True
                results = None
        except EOFError:
            # The worker has crashed or otherwise failed to send us results
            results = None
        p.join()
        recv_pipe.close()

        if results:
            store_results(attrs, results, args.dataset, args.count, distance)
        elif timed_out:
            print('algorithm worker process took too long')
        else:
            print('algorithm worker process stopped unexpectedly')
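The poll/recv/terminate sequence above is the general pattern for reading from a worker process with a deadline. A self-contained toy version (the payload and timeout are hypothetical stand-ins):

import multiprocessing

def _toy_worker(pipe):
    pipe.send('payload')  # stand-in for the real (attrs, results) tuple
    pipe.close()

def recv_with_deadline(timeout=5):
    recv_pipe, send_pipe = multiprocessing.Pipe(duplex=False)
    p = multiprocessing.Process(target=_toy_worker, args=(send_pipe,))
    p.start()
    send_pipe.close()  # drop the parent's copy so EOFError can propagate
    try:
        if recv_pipe.poll(timeout):
            result = recv_pipe.recv()
        else:
            p.terminate()  # deadline passed with nothing in the pipe
            result = None
    except EOFError:
        result = None  # worker died before sending anything
    p.join()
    recv_pipe.close()
    return result

if __name__ == '__main__':
    print(recv_with_deadline())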
Example #10
                        default='linear')
    parser.add_argument(
        '--raw',
        help='Show raw results (not just Pareto frontier) in faded colours',
        action='store_true')
    parser.add_argument('--batch',
                        help='Plot runs in batch mode',
                        action='store_true')
    parser.add_argument('--recompute',
                        help='Clears the cache and recomputes the metrics',
                        action='store_true')
    args = parser.parse_args()

    if not args.output:
        args.output = 'results/%s.png' % (args.dataset +
                                          ('-batch' if args.batch else ''))
        print('writing output to %s' % args.output)

    dataset, _ = get_dataset(args.dataset)
    count = int(args.count)
    unique_algorithms = get_unique_algorithms()
    results = load_all_results(args.dataset, count, args.batch)
    linestyles = create_linestyles(sorted(unique_algorithms))
    runs = compute_metrics(np.array(dataset["distances"]), results,
                           args.x_axis, args.y_axis, args.recompute)
    if not runs:
        raise Exception('Nothing to plot')

    create_plot(runs, args.raw, args.x_scale, args.y_scale, args.x_axis,
                args.y_axis, args.output, linestyles, args.batch)
Example #11
def run(definition,
        dataset,
        count,
        run_count=3,
        force_single=False,
        use_batch_query=False):
    algo = instantiate_algorithm(definition)

    D = get_dataset(dataset)
    X_train = numpy.array(D['train'])
    X_test = numpy.array(D['test'])
    distance = D.attrs['distance']
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    try:
        t0 = time.time()
        index_size_before = algo.get_index_size("self")
        algo.fit(X_train)
        build_time = time.time() - t0
        index_size = algo.get_index_size("self") - index_size_before
        print('Built index in', build_time)
        print('Index size: ', index_size)

        best_search_time = float('inf')
        for i in range(run_count):
            print('Run %d/%d...' % (i + 1, run_count))
            # A one-element list rather than a scalar: the nested query
            # functions below mutate it, and closures cannot rebind plain
            # outer locals in Python 2
            n_items_processed = [0]

            def single_query(v):
                start = time.time()
                candidates = algo.query(v, count)
                total = (time.time() - start)
                candidates = [
                    (int(idx),
                     float(metrics[distance]['distance'](v, X_train[idx])))
                    for idx in candidates
                ]
                n_items_processed[0] += 1
                if n_items_processed[0] % 1000 == 0:
                    print('Processed %d/%d queries...' %
                          (n_items_processed[0], X_test.shape[0]))
                if len(candidates) > count:
                    print(
                        'warning: algorithm %s returned %d results, but count is only %d)'
                        % (algo.name, len(candidates), count))
                return (total, candidates)

            def batch_query(X):
                start = time.time()
                batch_results = algo.batch_query(X, count)
                total = (time.time() - start)
                # Pair each query vector with its own candidate list
                candidates = [[
                    (int(idx),
                     float(metrics[distance]['distance'](v, X_train[idx])))
                    for idx in single_results
                ] for v, single_results in zip(X, batch_results)]
                return [(total / float(len(X)), v) for v in candidates]

            if use_batch_query:
                results = batch_query(X_test)
            elif algo.use_threads() and not force_single:
                pool = multiprocessing.pool.ThreadPool()
                results = pool.map(single_query, X_test)
            else:
                results = [single_query(x) for x in X_test]

            total_time = sum(time for time, _ in results)
            total_candidates = sum(
                len(candidates) for _, candidates in results)
            search_time = total_time / len(X_test)
            avg_candidates = total_candidates / len(X_test)
            best_search_time = min(best_search_time, search_time)

        verbose = hasattr(algo, "query_verbose")
        attrs = {
            "batch_mode": use_batch_query,
            "build_time": build_time,
            "best_search_time": best_search_time,
            "candidates": avg_candidates,
            "expect_extra": verbose,
            "index_size": index_size,
            "name": algo.name,
            "run_count": run_count,
            "run_alone": force_single,
            "distance": distance,
            "count": int(count),
            "algo": definition.algorithm,
            "dataset": dataset
        }
        store_results(dataset, count, definition, attrs, results)
    finally:
        algo.done()
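The one-element-list counter in single_query exists because Python 2 closures cannot rebind outer local variables. Under Python 3 the same progress reporting can use nonlocal directly; a sketch:

def make_progress_printer(total):
    processed = 0

    def tick():
        nonlocal processed  # rebind the enclosing variable directly
        processed += 1
        if processed % 1000 == 0:
            print('Processed %d/%d queries...' % (processed, total))

    return tick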
Example #12
def main():
    parser = argparse.ArgumentParser(
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
            '--dataset',
            metavar='NAME',
            help='the dataset to load training points from',
            default='glove')
    parser.add_argument(
            '--query-dataset',
            metavar='NAME',
            help='load query points from another dataset instead of choosing them randomly from the training dataset',
            default=None)
    parser.add_argument(
            "-k", "--count",
            default=10,
            type=positive_int,
            help="the number of near neighbours to search for")
    parser.add_argument(
            '--distance',
            help='the metric used to calculate the distance between points',
            default='angular')
    parser.add_argument(
            '--limit',
            help='the maximum number of points to load from the dataset, or -1 to load all of them',
            type=int,
            default=-1)
    parser.add_argument(
            '--definitions',
            metavar='FILE',
            help='load algorithm definitions from FILE',
            default='algos.yaml')
    parser.add_argument(
            '--algorithm',
            metavar='NAME',
            help='run only the named algorithm',
            default=None)
    parser.add_argument(
            '--sub-algorithm',
            metavar='NAME',
            help='run only the named instance of an algorithm (requires --algo)',
            default=None)
    parser.add_argument(
            '--list-algorithms',
            help='print the names of all known algorithms and exit',
            action='store_true',
            default=argparse.SUPPRESS)
    parser.add_argument(
            '--force',
            help='''re-run algorithms even if their results already exist''',
            action='store_true')
    parser.add_argument(
            '--runs',
            metavar='COUNT',
            type=positive_int,
            help='run each algorithm instance %(metavar)s times and use only the best result',
            default=3)
    parser.add_argument(
            '--timeout',
            type=int,
            help='Timeout (in seconds) for each individual algorithm run, or -1 if no timeout should be set',
            default=-1)
    parser.add_argument(
            '--single',
            help='run only a single algorithm instance at a time',
            action='store_true')
    parser.add_argument(
            '--no_save_index',
            help='do not save indices',
            action='store_true')

    args = parser.parse_args()
    if args.timeout == -1:
        args.timeout = None

    definitions = get_definitions(args.definitions)
    if hasattr(args, "list_algorithms"):
        print "The following algorithms are supported..."
        for point in definitions:
            print "\t... for the point type '%s'..." % point
            for metric in definitions[point]:
                print "\t\t... and the distance metric '%s':" % metric
                for algorithm in definitions[point][metric]:
                    print "\t\t\t%s" % algorithm
        sys.exit(0)

    # Set resource limits to prevent memory bombs
    memory_limit = 12 * 2**30
    soft, hard = resource.getrlimit(resource.RLIMIT_DATA)
    if soft == resource.RLIM_INFINITY or soft >= memory_limit:
        print('resetting memory limit from', soft, 'to', memory_limit)
        resource.setrlimit(resource.RLIMIT_DATA, (memory_limit, hard))

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    manifest, X = get_dataset(args.dataset, args.limit)
    if not args.query_dataset:
        X_train, X_test = split_dataset(
            X, test_size=manifest['dataset']['test_size'])
    else:
        X_train = X
        query_manifest, X_test = get_dataset(args.query_dataset)
        assert manifest["dataset"] == query_manifest["dataset"], """\
error: the training dataset and query dataset have incompatible manifests"""

    queries_fn = get_query_cache_path(
        args.dataset, args.count, args.limit, args.distance, args.query_dataset)
    print('storing queries in', queries_fn)

    if not os.path.exists(queries_fn):
        queries = compute_distances(args.distance, args.count, X_train, X_test)
        with open(queries_fn, 'wb') as f:  # pickle requires binary mode
            pickle.dump(queries, f)
    else:
        with open(queries_fn, 'rb') as f:
            queries = pickle.load(f)

    print('got', len(queries), 'queries')

    algos_already_run = set()
    if not args.force:
        for run in get_results(args.dataset, args.limit, args.count,
                args.distance, args.query_dataset):
            algos_already_run.add((run["library"], run["name"]))

    point_type = manifest['dataset']['point_type']
    algos = get_algorithms(definitions, constructors,
        len(X_train[0]), point_type, args.distance, args.count)

    if args.algorithm:
        print('running only', args.algorithm)
        algos = {args.algorithm: algos[args.algorithm]}
        if args.sub_algorithm:
            algos[args.algorithm] = [
                algo for algo in algos[args.algorithm]
                if algo.name == args.sub_algorithm
            ]

    algos_flat = []

    for library in algos.keys():
        for algo in algos[library]:
            if (library, algo.name) not in algos_already_run:
                algos_flat.append((library, algo))

    random.shuffle(algos_flat)

    print('order:', [a.name for l, a in algos_flat])

    for library, algo in algos_flat:
        recv_pipe, send_pipe = multiprocessing.Pipe(duplex=False)
        print(algo.name, '...')
        # Spawn a subprocess to force the memory to be reclaimed at the end
        p = multiprocessing.Process(
            target=run_algo,
            args=(args.count, X_train, queries, library, algo,
                  args.distance, send_pipe, args.runs, args.single))

        p.start()
        send_pipe.close()

        timed_out = False
        try:
            results = recv_pipe.poll(args.timeout)
            if results:
                # If there's something waiting in the pipe at this point, then
                # the worker has begun sending us results and we should receive
                # them
                results = recv_pipe.recv()
                if "expect_extra" in results:
                    if results["expect_extra"]:
                        results["extra"] = recv_pipe.recv()
                    del results["expect_extra"]
            else:
                # If we've exceeded the timeout and there are no results, then
                # terminate the worker process (XXX: what should we do about
                # algo.done() here?)
                p.terminate()
                timed_out = True
                results = None
        except EOFError:
            # The worker has crashed or otherwise failed to send us results
            results = None
        p.join()
        recv_pipe.close()

        if results:
            store_results(results, args.dataset, args.limit,
                    args.count, args.distance, args.query_dataset)
        elif timed_out:
            print "(algorithm worker process took too long)"
        else:
            print "(algorithm worker process stopped unexpectedly)"
Example #13
        help='Plot runs in batch mode',
        action='store_true')
    parser.add_argument(
        '--output',
        help='Path to the output csv file')
    parser.add_argument(
        '--recompute',
        action='store_true',
        help='Clears the cache and recomputes the metrics')
    args = parser.parse_args()

    count = int(args.count)
    rows = []
    for dataset_name in datasets:
        print("Looking at dataset", dataset_name)
        dataset = get_dataset(dataset_name)
        unique_algorithms = get_unique_algorithms()
        print('Loading results')
        results = load_all_results(dataset_name, count, True, args.batch)
        print('... done')
        results = compute_metrics_all_runs(list(dataset["distances"]),
                                           results, args.recompute)
        rows.extend(results)
    print('Build dataframe')
    data = pd.DataFrame(rows)
    print('... done')
    print(data.groupby(['dataset', 'count', 'algorithm', 'parameters']).count())
    with open(args.output, 'w') as fp:
        data.to_csv(fp, index=False)
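For reference, the groupby/count summary and CSV export above reduce to this pattern; a toy run with hypothetical metric rows:

import pandas as pd

rows = [  # hypothetical per-run metric rows
    {'dataset': 'glove-100-angular', 'count': 10,
     'algorithm': 'annoy', 'parameters': 'n_trees=100', 'recall': 0.91},
    {'dataset': 'glove-100-angular', 'count': 10,
     'algorithm': 'annoy', 'parameters': 'n_trees=100', 'recall': 0.93},
]
data = pd.DataFrame(rows)
# One output row per (dataset, count, algorithm, parameters) group,
# counting how many runs fell into each group
print(data.groupby(['dataset', 'count', 'algorithm', 'parameters']).count())
data.to_csv('results.csv', index=False)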


Example #14
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset',
        metavar='NAME',
        help='the dataset to load training points from',
        default='glove-100-angular',
        choices=DATASETS.keys())
    parser.add_argument(
        "-k", "--count",
        default=10,
        type=positive_int,
        help="the number of near neighbours to search for")
    parser.add_argument(
        '--definitions',
        metavar='FILE',
        help='load algorithm definitions from FILE',
        default='algos.yaml')
    parser.add_argument(
        '--algorithm',
        metavar='NAME',
        help='run only the named algorithm',
        default=None)
    parser.add_argument(
        '--docker-tag',
        metavar='NAME',
        help='run only algorithms in a particular docker image',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true')
    parser.add_argument(
        '--force',
        help='''re-run algorithms even if their results already exist''',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help='run each algorithm instance %(metavar)s times and use only the best result',
        default=3)
    parser.add_argument(
        '--timeout',
        type=int,
        help='Timeout (in seconds) for each individual algorithm run, or -1 if no timeout should be set',
        default=-1)
    parser.add_argument(
        '--local',
        action='store_true',
        help='If set, then will run everything locally (inside the same process) rather than using Docker')
    parser.add_argument(
        '--max-n-algorithms',
        type=int,
        help='Max number of algorithms to run (just used for testing)',
        default=-1)
    parser.add_argument(
        '--run-disabled',
        help='run algorithms that are disabled in algos.yml',
        action='store_true')

    args = parser.parse_args()
    if args.timeout == -1:
        args.timeout = None

    if args.list_algorithms:
        list_algorithms(args.definitions)
        sys.exit(0)

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    dataset = get_dataset(args.dataset)
    dimension = len(dataset['train'][0]) # TODO(erikbern): ugly
    point_type = 'float' # TODO(erikbern): should look at the type of X_train
    distance = dataset.attrs['distance']
    definitions = get_definitions(args.definitions, dimension, point_type, distance, args.count)

    # Filter out, from the loaded definitions, all those query argument groups
    # that correspond to experiments that have already been run. (This might
    # mean removing a definition altogether, so we can't just use a list
    # comprehension.)
    filtered_definitions = []
    for definition in definitions:
        query_argument_groups = definition.query_argument_groups
        if not query_argument_groups:
            query_argument_groups = [[]]
        not_yet_run = []
        for query_arguments in query_argument_groups:
            fn = get_result_filename(args.dataset, args.count, definition,
                                     query_arguments)
            if not os.path.exists(fn):
                not_yet_run.append(query_arguments)
        if not_yet_run:
            if definition.query_argument_groups:
                definition = definition._replace(
                    query_argument_groups=not_yet_run)
            filtered_definitions.append(definition)
    definitions = filtered_definitions

    random.shuffle(definitions)
    
    if args.algorithm:
        print('running only', args.algorithm)
        definitions = [d for d in definitions if d.algorithm == args.algorithm]

    if not args.local:
        # See which Docker images we have available
        docker_client = docker.from_env()
        docker_tags = set()
        for image in docker_client.images.list():
            for tag in image.tags:
                tag, _ = tag.split(':')
                docker_tags.add(tag)

        if args.docker_tag:
            print('running only', args.docker_tag)
            definitions = [d for d in definitions if d.docker_tag == args.docker_tag]

        if set(d.docker_tag for d in definitions).difference(docker_tags):
            print('not all docker images available, only:', set(docker_tags))
            print('missing docker images:', set(d.docker_tag for d in definitions).difference(docker_tags))
            definitions = [d for d in definitions if d.docker_tag in docker_tags]
    else:
        def _test(df):
            status = algorithm_status(df)
            # If the module was loaded but doesn't actually have a constructor of
            # the right name, then the definition is broken
            assert status != InstantiationStatus.NO_CONSTRUCTOR, """\
%s.%s(%s): error: the module '%s' does not expose the named constructor""" % (df.module, df.constructor, df.arguments, df.module)
            if status == InstantiationStatus.NO_MODULE:
                # If the module couldn't be loaded (presumably because of a missing
                # dependency), print a warning and remove this definition from the
                # list of things to be run
                print("""\
%s.%s(%s): warning: the module '%s' could not be loaded; skipping""" % (df.module, df.constructor, df.arguments, df.module))
                return False
            else:
                return True
        definitions = [d for d in definitions if _test(d)]

    if not args.run_disabled:
        disabled = [d for d in definitions if d.disabled]
        if disabled:
            print('Not running disabled algorithms:', disabled)
        definitions = [d for d in definitions if not d.disabled]

    if args.max_n_algorithms >= 0:
        definitions = definitions[:args.max_n_algorithms]

    if len(definitions) == 0:
        raise Exception('Nothing to run')
    else:
        print('Order:', definitions)

    for definition in definitions:
        print(definition, '...')

        try:
            if args.local:
                run(definition, args.dataset, args.count, args.runs)
            else:
                run_docker(definition, args.dataset, args.count, args.runs)
        except KeyboardInterrupt:
            break
        except Exception:
            traceback.print_exc()
Example #15
def run(definition, dataset, count, run_count=3, force_single=False, use_batch_query=False):
    algo = instantiate_algorithm(definition)

    D = get_dataset(dataset)
    X_train = numpy.array(D['train'])
    X_test = numpy.array(D['test'])
    distance = D.attrs['distance']
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    try:
        t0 = time.time()
        index_size_before = algo.get_index_size("self")
        algo.fit(X_train)
        build_time = time.time() - t0
        index_size = algo.get_index_size("self") - index_size_before
        print('Built index in', build_time)
        print('Index size: ', index_size)

        best_search_time = float('inf')
        for i in range(run_count):
            print('Run %d/%d...' % (i+1, run_count))
            n_items_processed = [0]  # a one-element list because closures cannot rebind plain outer locals in Python 2

            def single_query(v):
                start = time.time()
                candidates = algo.query(v, count)
                total = (time.time() - start)
                candidates = [(int(idx), float(metrics[distance]['distance'](v, X_train[idx])))
                              for idx in candidates]
                n_items_processed[0] += 1
                if n_items_processed[0] % 1000 == 0:
                    print('Processed %d/%d queries...' % (n_items_processed[0], X_test.shape[0]))
                if len(candidates) > count:
                    print('warning: algorithm %s returned %d results, but count is only %d)' % (algo.name, len(candidates), count))
                return (total, candidates)

            def batch_query(X):
                start = time.time()
                batch_results = algo.batch_query(X, count)
                total = (time.time() - start)
                # Pair each query vector with its own candidate list
                candidates = [[(int(idx), float(metrics[distance]['distance'](v, X_train[idx])))
                               for idx in single_results]
                              for v, single_results in zip(X, batch_results)]
                return [(total / float(len(X)), v) for v in candidates]

            if use_batch_query:
                results = batch_query(X_test)
            elif algo.use_threads() and not force_single:
                pool = multiprocessing.pool.ThreadPool()
                results = pool.map(single_query, X_test)
            else:
                results = [single_query(x) for x in X_test]

            total_time = sum(time for time, _ in results)
            total_candidates = sum(len(candidates) for _, candidates in results)
            search_time = total_time / len(X_test)
            avg_candidates = total_candidates / len(X_test)
            best_search_time = min(best_search_time, search_time)

        verbose = hasattr(algo, "query_verbose")
        attrs = {
            "batch_mode": use_batch_query,
            "build_time": build_time,
            "best_search_time": best_search_time,
            "candidates": avg_candidates,
            "expect_extra": verbose,
            "index_size": index_size,
            "name": algo.name,
            "run_count": run_count,
            "run_alone": force_single,
        }
        store_results(dataset, count, definition, attrs, results)
    finally:
        algo.done()
Example #16
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset',
        metavar='NAME',
        help='the dataset to load training points from',
        default='glove-100-angular',
        choices=DATASETS.keys())
    parser.add_argument(
        "-k", "--count",
        default=10,
        type=positive_int,
        help="the number of near neighbours to search for")
    parser.add_argument(
        '--definitions',
        metavar='FILE',
        help='load algorithm definitions from FILE',
        default='algos.yaml')
    parser.add_argument(
        '--algorithm',
        metavar='NAME',
        help='run only the named algorithm',
        default=None)
    parser.add_argument(
        '--docker-tag',
        metavar='NAME',
        help='run only algorithms in a particular docker image',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true',
        default=argparse.SUPPRESS)
    parser.add_argument(
        '--force',
        help='''re-run algorithms even if their results already exist''',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help='run each algorithm instance %(metavar)s times and use only the best result',
        default=3)
    parser.add_argument(
        '--timeout',
        type=int,
        help='Timeout (in seconds) for each individual algorithm run, or -1 if no timeout should be set',
        default=-1)
    parser.add_argument(
        '--local',
        action='store_true',
        help='If set, then will run everything locally (inside the same process) rather than using Docker')
    parser.add_argument(
        '--max-n-algorithms',
        type=int,
        help='Max number of algorithms to run (just used for testing)',
        default=-1)

    args = parser.parse_args()
    if args.timeout == -1:
        args.timeout = None

    if hasattr(args, "list_algorithms"):
        list_algorithms(args.definitions)
        sys.exit(0)

    # See which Docker images we have available
    docker_client = docker.from_env()
    docker_tags = set()
    for image in docker_client.images.list():
        for tag in image.tags:
            tag, _ = tag.split(':')
            docker_tags.add(tag)

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    dataset = get_dataset(args.dataset)
    dimension = len(dataset['train'][0]) # TODO(erikbern): ugly
    point_type = 'float' # TODO(erikbern): should look at the type of X_train
    distance = dataset.attrs['distance']
    definitions = get_definitions(args.definitions, dimension, point_type, distance, args.count)

    # TODO(erikbern): should make this a helper function somewhere
    definitions = [
        definition for definition in definitions
        if not os.path.exists(
            get_result_filename(args.dataset, args.count, definition))
    ]

    random.shuffle(definitions)
    
    if args.algorithm:
        print('running only', args.algorithm)
        definitions = [d for d in definitions if d.algorithm == args.algorithm]

    if args.docker_tag:
        print('running only', args.docker_tag)
        definitions = [d for d in definitions if d.docker_tag == args.docker_tag]

    if set(d.docker_tag for d in definitions).difference(docker_tags):
        print('not all docker images available, only:', set(docker_tags))
        print('missing docker images:', set(d.docker_tag for d in definitions).difference(docker_tags))
        definitions = [d for d in definitions if d.docker_tag in docker_tags]

    if args.max_n_algorithms >= 0:
        definitions = definitions[:args.max_n_algorithms]

    print('order:', definitions)

    for definition in definitions:
        print(definition, '...')

        try:
            if args.local:
                run(definition, args.dataset, args.count, args.runs)
            else:
                run_docker(definition, args.dataset, args.count, args.runs)
        except KeyboardInterrupt:
            break
        except Exception:
            traceback.print_exc()
Example #17
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--dataset',
                        metavar='NAME',
                        help='the dataset to load training points from',
                        default='glove-100-angular',
                        choices=DATASETS.keys())
    parser.add_argument("-k",
                        "--count",
                        default=10,
                        type=positive_int,
                        help="the number of near neighbours to search for")
    parser.add_argument('--definitions',
                        metavar='FILE',
                        help='load algorithm definitions from FILE',
                        default='algos.yaml')
    parser.add_argument('--algorithm',
                        metavar='NAME',
                        help='run only the named algorithm',
                        default=None)
    parser.add_argument(
        '--docker-tag',
        metavar='NAME',
        help='run only algorithms in a particular docker image',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true')
    parser.add_argument(
        '--force',
        help='''re-run algorithms even if their results already exist''',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help=
        'run each algorithm instance %(metavar)s times and use only the best result',
        default=2)
    parser.add_argument(
        '--timeout',
        type=int,
        help=
        'Timeout (in seconds) for each individual algorithm run, or -1 if no timeout should be set',
        default=-1)
    parser.add_argument(
        '--local',
        action='store_true',
        help=
        'If set, then will run everything locally (inside the same process) rather than using Docker'
    )
    parser.add_argument(
        '--max-n-algorithms',
        type=int,
        help='Max number of algorithms to run (just used for testing)',
        default=-1)
    parser.add_argument('--run-disabled',
                        help='run algorithms that are disabled in algos.yml',
                        action='store_true')

    args = parser.parse_args()
    if args.timeout == -1:
        args.timeout = None

    if args.list_algorithms:
        list_algorithms(args.definitions)
        sys.exit(0)

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    dataset = get_dataset(args.dataset)
    dimension = len(dataset['train'][0])  # TODO(erikbern): ugly
    point_type = 'float'  # TODO(erikbern): should look at the type of X_train
    distance = dataset.attrs['distance']
    definitions = get_definitions(args.definitions, dimension, point_type,
                                  distance, args.count)

    # Filter out, from the loaded definitions, all those query argument groups
    # that correspond to experiments that have already been run. (This might
    # mean removing a definition altogether, so we can't just use a list
    # comprehension.)
    filtered_definitions = []
    for definition in definitions:
        query_argument_groups = definition.query_argument_groups
        if not query_argument_groups:
            query_argument_groups = [[]]
        not_yet_run = []
        for query_arguments in query_argument_groups:
            fn = get_result_filename(args.dataset, args.count, definition,
                                     query_arguments)
            if not os.path.exists(fn):
                not_yet_run.append(query_arguments)
        if not_yet_run:
            if definition.query_argument_groups:
                definition = definition._replace(
                    query_argument_groups=not_yet_run)
            filtered_definitions.append(definition)
    definitions = filtered_definitions

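    # Randomise the order in which the remaining definitions are run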
    random.shuffle(definitions)

    if args.algorithm:
        print('running only', args.algorithm)
        definitions = [d for d in definitions if d.algorithm == args.algorithm]

    if not args.local:
        # See which Docker images we have available
        docker_client = docker.from_env()
        docker_tags = set()
        for image in docker_client.images.list():
            for tag in image.tags:
                tag = tag.rsplit(':', 1)[0]  # drop the version; repository names may themselves contain ':'
                docker_tags.add(tag)

        if args.docker_tag:
            print('running only', args.docker_tag)
            definitions = [
                d for d in definitions if d.docker_tag == args.docker_tag
            ]

        if set(d.docker_tag for d in definitions).difference(docker_tags):
            print('not all docker images available, only:', set(docker_tags))
            print(
                'missing docker images:',
                set(d.docker_tag for d in definitions).difference(docker_tags))
            definitions = [
                d for d in definitions if d.docker_tag in docker_tags
            ]
    else:

        def _test(df):
            status = algorithm_status(df)
            # If the module was loaded but doesn't actually have a constructor of
            # the right name, then the definition is broken
            assert status != InstantiationStatus.NO_CONSTRUCTOR, """\
%s.%s(%s): error: the module '%s' does not expose the named constructor""" % (
                df.module, df.constructor, df.arguments, df.module)
            if status == InstantiationStatus.NO_MODULE:
                # If the module couldn't be loaded (presumably because of a missing
                # dependency), print a warning and remove this definition from the
                # list of things to be run
                print("""\
%s.%s(%s): warning: the module '%s' could not be loaded; skipping""" %
                      (df.module, df.constructor, df.arguments, df.module))
                return False
            else:
                return True

        definitions = [d for d in definitions if _test(d)]

    if not args.run_disabled:
        if any(d.disabled for d in definitions):
            print('Not running disabled algorithms:',
                  [d for d in definitions if d.disabled])
        definitions = [d for d in definitions if not d.disabled]

    if args.max_n_algorithms >= 0:
        definitions = definitions[:args.max_n_algorithms]

    if not definitions:
        raise Exception('Nothing to run')
    else:
        print('Order:', definitions)

    for definition in definitions:
        print(definition, '...')

        try:
            if args.local:
                run(definition, args.dataset, args.count, args.runs)
            else:
                run_docker(definition, args.dataset, args.count, args.runs)
        except KeyboardInterrupt:
            break
        except Exception:
            traceback.print_exc()
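The parser above validates `-k/--count` and `--runs` with a `positive_int` argparse type that this excerpt never defines. A plausible minimal implementation (an assumption about the helper, not code from the original project) would be:

import argparse

def positive_int(value):
    # Convert the raw command-line string and reject anything that is
    # not a strictly positive integer; ArgumentTypeError makes argparse
    # print a proper usage message instead of a traceback.
    ivalue = int(value)
    if ivalue <= 0:
        raise argparse.ArgumentTypeError('%s is not a positive integer' % value)
    return ivalue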
Example No. 18
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--dataset',
                        metavar='NAME',
                        help='the dataset to load training points from',
                        default='glove')
    parser.add_argument(
        '--query-dataset',
        metavar='NAME',
        help=
        'load query points from another dataset instead of choosing them randomly from the training dataset',
        default=None)
    parser.add_argument("-k",
                        "--count",
                        default=10,
                        type=positive_int,
                        help="the number of near neighbours to search for")
    parser.add_argument(
        '--distance',
        help='the metric used to calculate the distance between points',
        default='angular')
    parser.add_argument(
        '--limit',
        help=
        'the maximum number of points to load from the dataset, or -1 to load all of them',
        type=int,
        default=-1)
    parser.add_argument('--definitions',
                        metavar='FILE',
                        help='load algorithm definitions from FILE',
                        default='algos.yaml')
    parser.add_argument('--algorithm',
                        metavar='NAME',
                        help='run only the named algorithm',
                        default=None)
    parser.add_argument(
        '--sub-algorithm',
        metavar='NAME',
        help='run only the named instance of an algorithm (requires --algorithm)',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true',
        default=argparse.SUPPRESS)
    parser.add_argument(
        '--force',
        help='''re-run algorithms even if their results already exist''',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help=
        'run each algorithm instance %(metavar)s times and use only the best result',
        default=3)
    parser.add_argument(
        '--timeout',
        type=int,
        help=
        'Timeout (in seconds) for each individual algorithm run, or -1 if no timeout should be set',
        default=-1)
    parser.add_argument('--single',
                        help='run only a single algorithm instance at a time',
                        action='store_true')
    parser.add_argument('--no_save_index',
                        help='do not save indices',
                        action='store_true')

    args = parser.parse_args()
    if args.timeout == -1:
        args.timeout = None

    definitions = get_definitions(args.definitions)
    if hasattr(args, "list_algorithms"):
        print "The following algorithms are supported..."
        for point in definitions:
            print "\t... for the point type '%s'..." % point
            for metric in definitions[point]:
                print "\t\t... and the distance metric '%s':" % metric
                for algorithm in definitions[point][metric]:
                    print "\t\t\t%s" % algorithm
        sys.exit(0)

    # Set resource limits to prevent memory bombs
    memory_limit = 12 * 2**30  # 12 GiB
    soft, hard = resource.getrlimit(resource.RLIMIT_DATA)
    if soft == resource.RLIM_INFINITY or soft >= memory_limit:
        print('resetting memory limit from', soft, 'to', memory_limit)
        resource.setrlimit(resource.RLIMIT_DATA, (memory_limit, hard))

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    manifest, X = get_dataset(args.dataset, args.limit)
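    # Either carve test queries out of the training set, or load them from a
    # separate query dataset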
    if not args.query_dataset:
        X_train, X_test = split_dataset(
            X, test_size=manifest['dataset']['test_size'])
    else:
        X_train = X
        query_manifest, X_test = get_dataset(args.query_dataset)
        assert manifest["dataset"] == query_manifest["dataset"], """\
error: the training dataset and query dataset have incompatible manifests"""

    queries_fn = get_query_cache_path(args.dataset, args.count, args.limit,
                                      args.distance, args.query_dataset)
    print('storing queries in', queries_fn)

    if not os.path.exists(queries_fn):
        queries = compute_distances(args.distance, args.count, X_train, X_test)
        with open(queries_fn, 'wb') as f:  # pickle needs a binary file
            pickle.dump(queries, f)
    else:
        with open(queries_fn, 'rb') as f:
            queries = pickle.load(f)

    print('got', len(queries), 'queries')

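    # Remember which (library, algorithm) pairs already have stored results,
    # so they can be skipped below unless --force is given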
    algos_already_run = set()
    if not args.force:
        for run in get_results(args.dataset, args.limit, args.count,
                               args.distance, args.query_dataset):
            algos_already_run.add((run["library"], run["name"]))

    point_type = manifest['dataset']['point_type']
    algos = get_algorithms(definitions, constructors, len(X_train[0]),
                           point_type, args.distance, args.count)

    if args.algorithm:
        print('running only', args.algorithm)
        algos = {args.algorithm: algos[args.algorithm]}
        if args.sub_algorithm:
            algos[args.algorithm] = [
                algo for algo in algos[args.algorithm]
                if algo.name == args.sub_algorithm
            ]

    algos_flat = []

    for library in algos.keys():
        for algo in algos[library]:
            if (library, algo.name) not in algos_already_run:
                algos_flat.append((library, algo))

    random.shuffle(algos_flat)

    print('order:', [a.name for l, a in algos_flat])

    for library, algo in algos_flat:
        recv_pipe, send_pipe = multiprocessing.Pipe(duplex=False)
        print(algo.name, '...')
        # Spawn a subprocess to force the memory to be reclaimed at the end
        p = multiprocessing.Process(target=run_algo,
                                    args=(args.count, X_train, queries,
                                          library, algo, args.distance,
                                          send_pipe, args.runs, args.single))

        p.start()
        send_pipe.close()

        timed_out = False
        try:
            ready = recv_pipe.poll(args.timeout)
            if ready:
                # If there's something waiting in the pipe at this point, then
                # the worker has begun sending us results and we should receive
                # them
                results = recv_pipe.recv()
                if "expect_extra" in results:
                    if results["expect_extra"]:
                        results["extra"] = recv_pipe.recv()
                    del results["expect_extra"]
            else:
                # If we've exceeded the timeout and there are no results, then
                # terminate the worker process (XXX: what should we do about
                # algo.done() here?)
                p.terminate()
                timed_out = True
                results = None
        except EOFError:
            # The worker has crashed or otherwise failed to send us results
            results = None
        p.join()
        recv_pipe.close()

        if results:
            store_results(results, args.dataset, args.limit, args.count,
                          args.distance, args.query_dataset)
        elif timed_out:
            print "(algorithm worker process took too long)"
        else:
            print "(algorithm worker process stopped unexpectedly)"
Example No. 19
                        action='store_true')
    parser.add_argument('-Y',
                        '--y-log',
                        help='Draw the Y-axis using a logarithmic scale',
                        action='store_true')
    parser.add_argument(
        '--raw',
        help='Show raw results (not just Pareto frontier) in faded colours',
        action='store_true')
    args = parser.parse_args()

    if not args.output:
        args.output = 'results/%s.png' % args.dataset
        print('writing output to %s' % args.output)

    dataset = get_dataset(args.dataset)
    dimension = len(dataset['train'][0])  # TODO(erikbern): ugly
    point_type = 'float'  # TODO(erikbern): should look at the type of X_train
    distance = dataset.attrs['distance']
    count = int(args.count)
    definitions = get_definitions(args.definitions, dimension, point_type,
                                  distance, count)
    unique_algorithms = get_unique_algorithms(args.definitions)
    linestyles = create_linestyles(unique_algorithms)
    results = load_results(args.dataset, count, definitions)
    runs = compute_metrics(list(dataset["distances"]), results, args.x_axis,
                           args.y_axis)
    if not runs:
        raise Exception('Nothing to plot')

    create_plot(runs, args.raw, args.x_log, args.y_log, args.x_axis,
                args.y_axis, args.output, linestyles)
Example No. 20
        action='store_true')
    parser.add_argument(
        '-Y', '--y-log',
        help='Draw the Y-axis using a logarithmic scale',
        action='store_true')
    parser.add_argument(
        '--raw',
        help='Show raw results (not just Pareto frontier) in faded colours',
        action='store_true')
    args = parser.parse_args()

    if not args.output:
        args.output = 'results/%s.png' % args.dataset
        print('writing output to %s' % args.output)

    dataset = get_dataset(args.dataset)
    dimension = len(dataset['train'][0]) # TODO(erikbern): ugly
    point_type = 'float' # TODO(erikbern): should look at the type of X_train
    distance = dataset.attrs['distance']
    count = int(args.count)
    definitions = get_definitions(args.definitions, dimension, point_type, distance, count)
    unique_algorithms = get_unique_algorithms(args.definitions)
    linestyles = create_linestyles(unique_algorithms)
    results = load_results(args.dataset, count, definitions)
    runs = compute_metrics(list(dataset["distances"]), results, args.x_axis, args.y_axis)
    if not runs:
        raise Exception('Nothing to plot')

    create_plot(runs, args.raw, args.x_log, args.y_log, args.x_axis,
                args.y_axis, args.output, linestyles)
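The `--raw` flag above contrasts all results with the Pareto frontier that the plot normally shows. A minimal sketch of extracting such a frontier from (x, y) pairs, assuming larger is better on both axes (which may not match the metric conventions inside create_plot):

def pareto_frontier(points):
    # Sort by x descending; a point joins the frontier only if its y
    # strictly beats every point already kept (i.e. it is not dominated).
    frontier = []
    for x, y in sorted(points, reverse=True):
        if not frontier or y > frontier[-1][1]:
            frontier.append((x, y))
    return frontier

print(pareto_frontier([(0.9, 100), (0.8, 500), (0.8, 200), (0.5, 50)]))
# -> [(0.9, 100), (0.8, 500)]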