Example 1
def run(definition, dataset, count, run_count, batch):
    algo = instantiate_algorithm(definition)
    assert not definition.query_argument_groups \
        or hasattr(algo, "set_query_arguments"), """\
error: query argument groups have been specified for %s.%s(%s), but the \
algorithm instantiated from it does not implement the set_query_arguments \
function""" % (definition.module, definition.constructor, definition.arguments)

    D = get_dataset(dataset)
    X_train = numpy.array(D['train'])
    X_test = numpy.array(D['test'])
    distance = D.attrs['distance']
    print("type D: ", type(D))
    print("type x_train: ", type(X_train))
    print("type x_test: ", type(X_test))
    print("type distance: ", type(distance))
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    X_train = dataset_transform[distance](X_train)
    X_test = dataset_transform[distance](X_test)

    try:
        prepared_queries = False
        if hasattr(algo, "supports_prepared_queries"):
            prepared_queries = algo.supports_prepared_queries()

        t0 = time.time()
        memory_usage_before = algo.get_memory_usage()
        algo.fit(X_train)
        build_time = time.time() - t0
        index_size = algo.get_memory_usage() - memory_usage_before
        print('Built index in', build_time)
        print('Index size: ', index_size)

        query_argument_groups = definition.query_argument_groups
        # Make sure that algorithms with no query argument groups still get run
        # once by providing them with a single, empty, harmless group
        if not query_argument_groups:
            query_argument_groups = [[]]

        for pos, query_arguments in enumerate(query_argument_groups, 1):
            print("Running query argument group %d of %d..." %
                  (pos, len(query_argument_groups)))
            if query_arguments:
                algo.set_query_arguments(*query_arguments)
            descriptor, results = run_individual_query(algo, X_train, X_test,
                                                       distance, count,
                                                       run_count, batch)
            descriptor["build_time"] = build_time
            descriptor["index_size"] = index_size
            descriptor["algo"] = get_algorithm_name(definition.algorithm,
                                                    batch)
            descriptor["dataset"] = dataset
            store_results(dataset, count, definition, query_arguments,
                          descriptor, results, batch)
    finally:
        algo.done()
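
Example 1 passes the train and test sets through dataset_transform[distance], which is not defined in the snippet. A minimal sketch of what that lookup table could look like, assuming it is a dict keyed by distance name mapping to a preprocessing function (the entries below are illustrative assumptions, not the benchmark's actual code):

import numpy

# Hypothetical dataset_transform: the angular entry normalises each row to
# unit length; the other distances pass the data through unchanged.
dataset_transform = {
    'angular': lambda X: X / numpy.linalg.norm(X, axis=1, keepdims=True),
    'euclidean': lambda X: X,
    'hamming': lambda X: X,
}
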
Example 2
def run(definition, dataset, count, run_count, batch):
    algo = instantiate_algorithm(definition)
    assert not definition.query_argument_groups \
        or hasattr(algo, "set_query_arguments"), """\
error: query argument groups have been specified for %s.%s(%s), but the \
algorithm instantiated from it does not implement the set_query_arguments \
function""" % (definition.module, definition.constructor, definition.arguments)

    D = get_dataset(dataset)
    X_train = numpy.array(D['train'])
    X_test = numpy.array(D['test'])
    distance = D.attrs['distance']
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    try:
        prepared_queries = False
        if hasattr(algo, "supports_prepared_queries"):
            prepared_queries = algo.supports_prepared_queries()

        t0 = time.time()
        memory_usage_before = algo.get_memory_usage()
        algo.fit(X_train)
        build_time = time.time() - t0
        index_size = algo.get_memory_usage() - memory_usage_before
        print('Built index in', build_time)
        print('Index size: ', index_size)

        query_argument_groups = definition.query_argument_groups
        # Make sure that algorithms with no query argument groups still get run
        # once by providing them with a single, empty, harmless group
        if not query_argument_groups:
            query_argument_groups = [[]]

        for pos, query_arguments in enumerate(query_argument_groups, 1):
            print("Running query argument group %d of %d..." %
                  (pos, len(query_argument_groups)))
            if query_arguments:
                algo.set_query_arguments(*query_arguments)
            descriptor, results = run_individual_query(
                algo, X_train, X_test, distance, count, run_count, batch)
            descriptor["build_time"] = build_time
            descriptor["index_size"] = index_size
            descriptor["algo"] = get_algorithm_name(
                definition.algorithm, batch)
            descriptor["dataset"] = dataset
            store_results(dataset, count, definition,
                          query_arguments, descriptor, results, batch)
    finally:
        algo.done()
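
In every example the definition argument is consumed only through its attributes (module, constructor, arguments, query_argument_groups, algorithm). A minimal invocation sketch, assuming a plain namedtuple can stand in for the real definition type; the field values and the commented-out call are illustrative assumptions:

import collections

# Hypothetical stand-in exposing only the attributes read by run() above.
Definition = collections.namedtuple(
    'Definition',
    ['algorithm', 'module', 'constructor', 'arguments', 'query_argument_groups'])

definition = Definition(
    algorithm='bruteforce',
    module='ann_benchmarks.algorithms.bruteforce',
    constructor='BruteForce',
    arguments=['angular'],
    query_argument_groups=[],  # empty, so run() falls back to [[]]
)

# run(definition, 'glove-100-angular', count=10, run_count=3, batch=False)
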
Example 3
def run(definition, dataset, count, run_count, batch):
    algo = instantiate_algorithm(definition)
    assert not definition.query_argument_groups \
            or hasattr(algo, "set_query_arguments"), """\
error: query argument groups have been specified for %s.%s(%s), but the \
algorithm instantiated from it does not implement the set_query_arguments \
function""" % (definition.module, definition.constructor, definition.arguments)

    D = get_dataset(dataset)
    X_train = numpy.array(D['train'])
    X_test = numpy.array(D['test'])
    if algo.builds_graph():
        # Test data first to avoid converting test set index to graph index
        X_train = numpy.concatenate((X_test, X_train))
        # The protocol expects the count to be given at query time, so it has
        # to be set as a parameter beforehand.
        algo.set_count(count)
    distance = D.attrs['distance']
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    try:
        prepared_queries = False
        if hasattr(algo, "supports_prepared_queries"):
            prepared_queries = algo.supports_prepared_queries()

        t0 = time.time()
        memory_usage_before = algo.get_memory_usage()
        algo.fit(X_train)

        build_time = time.time() - t0
        index_size = algo.get_memory_usage() - memory_usage_before
        print('Built index in', build_time)
        print('Index size: ', index_size)

        query_argument_groups = definition.query_argument_groups
        # Make sure that algorithms with no query argument groups still get run
        # once by providing them with a single, empty, harmless group
        if not query_argument_groups:
            query_argument_groups = [[]]

        for pos, query_arguments in enumerate(query_argument_groups, 1):
            print("Running query argument group %d of %d..." %
                    (pos, len(query_argument_groups)))
            if query_arguments:
                algo.set_query_arguments(*query_arguments)
            if algo.builds_graph():
                descriptor, results = check_graph(algo, X_train, X_test, distance, count)
            else:
                descriptor, results = run_individual_query(algo, X_train, X_test,
                    distance, count, run_count, batch)
            descriptor["build_time"] = build_time
            descriptor["index_size"] = index_size
            descriptor["algo"] = get_algorithm_name(definition.algorithm, batch)
            descriptor["dataset"] = dataset
            descriptor["count"] = int(count)
            descriptor["batch_mode"] = batch
            store_results(dataset, count, definition,
                    query_arguments, descriptor, results, batch)
    finally:
        algo.done()
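
Example 3 adds an optional graph-construction path: when the wrapper reports builds_graph(), the test points are prepended to the training data and the neighbour count is fixed before fit() via set_count(), and check_graph() replaces the usual query loop. A small stub illustrating the hooks this variant probes for; the class is a hypothetical sketch, not the benchmark's real base class:

class GraphStub:
    """Hypothetical algorithm wrapper exposing the optional graph hooks."""

    def builds_graph(self):
        # Tells run() to concatenate test+train data and call check_graph().
        return True

    def set_count(self, count):
        # Graph builders need the neighbour count before fit() is called.
        self.count = count

    def fit(self, X):
        pass  # build a k-NN graph over X here

    def get_memory_usage(self):
        return 0

    def done(self):
        pass
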
Example 4
def run(definition,
        dataset,
        count,
        run_count=3,
        force_single=False,
        use_batch_query=False):
    algo = instantiate_algorithm(definition)

    D = get_dataset(dataset)
    X_train = numpy.array(D['train'])
    X_test = numpy.array(D['test'])
    distance = D.attrs['distance']
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    try:
        t0 = time.time()
        index_size_before = algo.get_index_size("self")
        algo.fit(X_train)
        build_time = time.time() - t0
        index_size = algo.get_index_size("self") - index_size_before
        print('Built index in', build_time)
        print('Index size: ', index_size)

        best_search_time = float('inf')
        for i in range(run_count):
            print('Run %d/%d...' % (i + 1, run_count))
            # A one-element list so the nested single_query closure can mutate
            # the counter (a plain int could not be rebound from inside it).
            n_items_processed = [0]

            def single_query(v):
                start = time.time()
                candidates = algo.query(v, count)
                total = (time.time() - start)
                candidates = [
                    (int(idx),
                     float(metrics[distance]['distance'](v, X_train[idx])))
                    for idx in candidates
                ]
                n_items_processed[0] += 1
                if n_items_processed[0] % 1000 == 0:
                    print('Processed %d/%d queries...' %
                          (n_items_processed[0], X_test.shape[0]))
                if len(candidates) > count:
                    print(
                        'warning: algorithm %s returned %d results, but count is only %d'
                        % (algo.name, len(candidates), count))
                return (total, candidates)

            def batch_query(X):
                start = time.time()
                results = algo.batch_query(X, count)  # per-query candidate lists
                total = (time.time() - start)
                candidates = [[
                    (int(idx),
                     float(metrics[distance]['distance'](v, X_train[idx])))
                    for idx in single_results
                ] for v, single_results in zip(X, results)]
                return [(total / float(len(X)), v) for v in candidates]

            if use_batch_query:
                results = batch_query(X_test)
            elif algo.use_threads() and not force_single:
                pool = multiprocessing.pool.ThreadPool()
                results = pool.map(single_query, X_test)
            else:
                results = [single_query(x) for x in X_test]

            total_time = sum(time for time, _ in results)
            total_candidates = sum(
                len(candidates) for _, candidates in results)
            search_time = total_time / len(X_test)
            avg_candidates = total_candidates / len(X_test)
            best_search_time = min(best_search_time, search_time)

        verbose = hasattr(algo, "query_verbose")
        attrs = {
            "batch_mode": use_batch_query,
            "build_time": build_time,
            "best_search_time": best_search_time,
            "candidates": avg_candidates,
            "expect_extra": verbose,
            "index_size": index_size,
            "name": algo.name,
            "run_count": run_count,
            "run_alone": force_single,
            "distance": distance,
            "count": int(count),
            "algo": definition.algorithm,
            "dataset": dataset
        }
        store_results(dataset, count, definition, attrs, results)
    finally:
        algo.done()
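
The n_items_processed = [0] idiom above exists only so the nested single_query closure can update a counter in the enclosing scope. In Python 3 the same effect can be had with nonlocal; a small sketch of that alternative, isolated from the benchmark code:

def run_queries(queries):
    n_items_processed = 0

    def single_query(v):
        nonlocal n_items_processed  # rebind the enclosing function's variable
        n_items_processed += 1
        if n_items_processed % 1000 == 0:
            print('Processed %d queries...' % n_items_processed)
        # ... issue the actual query here ...

    for q in queries:
        single_query(q)
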
Example 5
def run(definition, dataset, count, run_count=3, force_single=False, use_batch_query=False):
    algo = instantiate_algorithm(definition)

    D = get_dataset(dataset)
    X_train = numpy.array(D['train'])
    X_test = numpy.array(D['test'])
    distance = D.attrs['distance']
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    try:
        t0 = time.time()
        index_size_before = algo.get_index_size("self")
        algo.fit(X_train)
        build_time = time.time() - t0
        index_size = algo.get_index_size("self") - index_size_before
        print('Built index in', build_time)
        print('Index size: ', index_size)

        best_search_time = float('inf')
        for i in range(run_count):
            print('Run %d/%d...' % (i+1, run_count))
            n_items_processed = [0]  # one-element list so the nested closure can mutate the counter

            def single_query(v):
                start = time.time()
                candidates = algo.query(v, count)
                total = (time.time() - start)
                candidates = [(int(idx), float(metrics[distance]['distance'](v, X_train[idx])))
                              for idx in candidates]
                n_items_processed[0] += 1
                if n_items_processed[0] % 1000 == 0:
                    print('Processed %d/%d queries...' % (n_items_processed[0], X_test.shape[0]))
                if len(candidates) > count:
                    print('warning: algorithm %s returned %d results, but count is only %d' % (algo.name, len(candidates), count))
                return (total, candidates)

            def batch_query(X):
                start = time.time()
                results = algo.batch_query(X, count)  # per-query candidate lists
                total = (time.time() - start)
                candidates = [[(int(idx), float(metrics[distance]['distance'](v, X_train[idx])))
                               for idx in single_results]
                              for v, single_results in zip(X, results)]
                return [(total / float(len(X)), v) for v in candidates]

            if use_batch_query:
                results = batch_query(X_test)
            elif algo.use_threads() and not force_single:
                pool = multiprocessing.pool.ThreadPool()
                results = pool.map(single_query, X_test)
            else:
                results = [single_query(x) for x in X_test]

            total_time = sum(time for time, _ in results)
            total_candidates = sum(len(candidates) for _, candidates in results)
            search_time = total_time / len(X_test)
            avg_candidates = total_candidates / len(X_test)
            best_search_time = min(best_search_time, search_time)

        verbose = hasattr(algo, "query_verbose")
        attrs = {
            "batch_mode": use_batch_query,
            "build_time": build_time,
            "best_search_time": best_search_time,
            "candidates": avg_candidates,
            "expect_extra": verbose,
            "index_size": index_size,
            "name": algo.name,
            "run_count": run_count,
            "run_alone": force_single,
        }
        store_results(dataset, count, definition, attrs, results)
    finally:
        algo.done()
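
In the last two examples each stored result is a pair (elapsed_seconds, [(train_index, distance), ...]). A hedged sketch of how a downstream consumer could compute recall from that structure against a ground-truth neighbour list; the helper below is hypothetical and not part of the benchmark:

def recall(results, ground_truth, count):
    # results: [(elapsed_seconds, [(train_index, distance), ...]), ...]
    # ground_truth: one list of true neighbour indices per query
    hits = 0
    for (_, candidates), true_neighbours in zip(results, ground_truth):
        found = {idx for idx, _ in candidates[:count]}
        hits += len(found & set(true_neighbours[:count]))
    return hits / (count * len(results))
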