def test_all_algos():
    definitions = get_definitions("algos.yaml")
    for metric in ['angular', 'euclidean']:
        algos = get_algorithms(definitions, constructors,
                               len(X[0]), "float", metric, 10)
        for algo_key in algos.keys():
            algo = random.choice(algos[algo_key])  # Just pick one of each
            # pass name just so unittest can capture it
            yield check_algo, algo.name, algo
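
# For context: test_all_algos above is a nose-style test generator. Each
# yielded (callable, *args) tuple is collected by the runner and reported as
# a separate test case. Below is a minimal, self-contained sketch of the same
# pattern; check_algo and the algos dict here are hypothetical stand-ins for
# the project's real checker and registry, not part of this codebase.
import random

def check_algo(name, algo):
    assert algo is not None, '%s failed to instantiate' % name

def test_generator_demo():
    algos = {'bruteforce': ['bf'], 'annoy': ['annoy-10', 'annoy-50']}
    for key in algos:
        algo = random.choice(algos[key])  # just pick one of each
        yield check_algo, key, algo  # name first so the runner can label it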
    '--y-log',
    help='Draw the Y-axis using a logarithmic scale',
    action='store_true')
parser.add_argument(
    '--raw',
    help='Show raw results (not just Pareto frontier) in faded colours',
    action='store_true')
args = parser.parse_args()

if not args.output:
    args.output = 'results/%s.png' % args.dataset
    print('writing output to %s' % args.output)

dataset = get_dataset(args.dataset)
dimension = len(dataset['train'][0])  # TODO(erikbern): ugly
point_type = 'float'  # TODO(erikbern): should look at the type of X_train
distance = dataset.attrs['distance']
count = int(args.count)
definitions = get_definitions(
    args.definitions, dimension, point_type, distance, count)
unique_algorithms = get_unique_algorithms(args.definitions)
linestyles = create_linestyles(unique_algorithms)
results = load_results(args.dataset, count, definitions)
runs = compute_metrics(list(dataset["distances"]), results,
                       args.x_axis, args.y_axis)
if not runs:
    raise Exception('Nothing to plot')

create_plot(runs, args.raw, args.x_log, args.y_log,
            args.x_axis, args.y_axis, args.output, linestyles)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset',
        metavar='NAME',
        help='the dataset to load training points from',
        default='glove-100-angular',
        choices=DATASETS.keys())
    parser.add_argument(
        "-k", "--count",
        default=10,
        type=positive_int,
        help="the number of near neighbours to search for")
    parser.add_argument(
        '--definitions',
        metavar='FILE',
        help='load algorithm definitions from FILE',
        default='algos.yaml')
    parser.add_argument(
        '--algorithm',
        metavar='NAME',
        help='run only the named algorithm',
        default=None)
    parser.add_argument(
        '--docker-tag',
        metavar='NAME',
        help='run only algorithms in a particular docker image',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true')
    parser.add_argument(
        '--force',
        help='re-run algorithms even if their results already exist',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help='run each algorithm instance %(metavar)s times and use only '
             'the best result',
        default=2)
    parser.add_argument(
        '--timeout',
        type=int,
        help='Timeout (in seconds) for each individual algorithm run, '
             'or -1 if no timeout should be set',
        default=-1)
    parser.add_argument(
        '--local',
        action='store_true',
        help='If set, then will run everything locally (inside the same '
             'process) rather than using Docker')
    parser.add_argument(
        '--max-n-algorithms',
        type=int,
        help='Max number of algorithms to run (just used for testing)',
        default=-1)
    parser.add_argument(
        '--run-disabled',
        help='run algorithms that are disabled in algos.yml',
        action='store_true')
    args = parser.parse_args()

    if args.timeout == -1:
        args.timeout = None

    if args.list_algorithms:
        list_algorithms(args.definitions)
        sys.exit(0)

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    dataset = get_dataset(args.dataset)
    dimension = len(dataset['train'][0])  # TODO(erikbern): ugly
    point_type = 'float'  # TODO(erikbern): should look at the type of X_train
    distance = dataset.attrs['distance']
    definitions = get_definitions(
        args.definitions, dimension, point_type, distance, args.count)

    # Filter out, from the loaded definitions, all those query argument groups
    # that correspond to experiments that have already been run. (This might
    # mean removing a definition altogether, so we can't just use a list
    # comprehension.)
    filtered_definitions = []
    for definition in definitions:
        query_argument_groups = definition.query_argument_groups
        if not query_argument_groups:
            query_argument_groups = [[]]
        not_yet_run = []
        for query_arguments in query_argument_groups:
            fn = get_result_filename(
                args.dataset, args.count, definition, query_arguments)
            if not os.path.exists(fn):
                not_yet_run.append(query_arguments)
        if not_yet_run:
            if definition.query_argument_groups:
                definition = definition._replace(
                    query_argument_groups=not_yet_run)
            filtered_definitions.append(definition)
    definitions = filtered_definitions

    random.shuffle(definitions)

    if args.algorithm:
        print('running only', args.algorithm)
        definitions = [d for d in definitions if d.algorithm == args.algorithm]

    if not args.local:
        # See which Docker images we have available
        docker_client = docker.from_env()
        docker_tags = set()
        for image in docker_client.images.list():
            for tag in image.tags:
                tag = tag.split(':')[0]
                docker_tags.add(tag)

        if args.docker_tag:
            print('running only', args.docker_tag)
            definitions = [
                d for d in definitions if d.docker_tag == args.docker_tag]

        if set(d.docker_tag for d in definitions).difference(docker_tags):
            print('not all docker images available, only:', set(docker_tags))
            print('missing docker images:',
                  set(d.docker_tag for d in definitions).difference(docker_tags))
            definitions = [
                d for d in definitions if d.docker_tag in docker_tags]
    else:
        def _test(df):
            status = algorithm_status(df)
            # If the module was loaded but doesn't actually have a constructor
            # of the right name, then the definition is broken
            assert status != InstantiationStatus.NO_CONSTRUCTOR, \
                "%s.%s(%s): error: the module '%s' does not expose the " \
                "named constructor" % (
                    df.module, df.constructor, df.arguments, df.module)
            if status == InstantiationStatus.NO_MODULE:
                # If the module couldn't be loaded (presumably because of a
                # missing dependency), print a warning and remove this
                # definition from the list of things to be run
                print("%s.%s(%s): warning: the module '%s' could not be "
                      "loaded; skipping" % (
                          df.module, df.constructor, df.arguments, df.module))
                return False
            else:
                return True
        definitions = [d for d in definitions if _test(d)]

    if not args.run_disabled:
        if len([d for d in definitions if d.disabled]):
            print('Not running disabled algorithms:',
                  [d for d in definitions if d.disabled])
        definitions = [d for d in definitions if not d.disabled]

    if args.max_n_algorithms >= 0:
        definitions = definitions[:args.max_n_algorithms]

    if len(definitions) == 0:
        raise Exception('Nothing to run')
    else:
        print('Order:', definitions)

    for definition in definitions:
        print(definition, '...')
        try:
            if args.local:
                run(definition, args.dataset, args.count, args.runs)
            else:
                run_docker(definition, args.dataset, args.count, args.runs)
        except KeyboardInterrupt:
            break
        except Exception:
            traceback.print_exc()
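
# The filtering loop above relies on namedtuple's _replace, which returns a
# copy of the tuple with the named fields swapped out (definitions are
# immutable tuples, so they can't be edited in place). A small illustration
# with a made-up Definition type, not the project's real one:
from collections import namedtuple

Definition = namedtuple('Definition', ['algorithm', 'query_argument_groups'])

d = Definition(algorithm='annoy', query_argument_groups=[[100], [200], [400]])
d2 = d._replace(query_argument_groups=[[200]])  # keep only the unrun groups
assert d.query_argument_groups == [[100], [200], [400]]  # original untouched
assert d2.algorithm == 'annoy'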
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset',
        metavar='NAME',
        help='the dataset to load training points from',
        default='glove-100-angular',
        choices=DATASETS.keys())
    parser.add_argument(
        "-k", "--count",
        default=10,
        type=positive_int,
        help="the number of near neighbours to search for")
    parser.add_argument(
        '--definitions',
        metavar='FILE',
        help='load algorithm definitions from FILE',
        default='algos.yaml')
    parser.add_argument(
        '--algorithm',
        metavar='NAME',
        help='run only the named algorithm',
        default=None)
    parser.add_argument(
        '--docker-tag',
        metavar='NAME',
        help='run only algorithms in a particular docker image',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true')
    parser.add_argument(
        '--force',
        help='re-run algorithms even if their results already exist',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help='run each algorithm instance %(metavar)s times and use only'
             ' the best result',
        default=5)
    parser.add_argument(
        '--timeout',
        type=int,
        help='Timeout (in seconds) for each individual algorithm run, '
             'or -1 if no timeout should be set',
        default=2 * 3600)
    parser.add_argument(
        '--local',
        action='store_true',
        help='If set, then will run everything locally (inside the same '
             'process) rather than using Docker')
    parser.add_argument(
        '--batch',
        action='store_true',
        help='If set, algorithms get all queries at once')
    parser.add_argument(
        '--max-n-algorithms',
        type=int,
        help='Max number of algorithms to run (just used for testing)',
        default=-1)
    parser.add_argument(
        '--run-disabled',
        help='run algorithms that are disabled in algos.yml',
        action='store_true')
    parser.add_argument(
        '--parallelism',
        type=positive_int,
        help='Number of Docker containers in parallel',
        default=1)
    args = parser.parse_args()

    if args.timeout == -1:
        args.timeout = None

    if args.list_algorithms:
        list_algorithms(args.definitions)
        sys.exit(0)

    logging.config.fileConfig("logging.conf")
    logger = logging.getLogger("annb")

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    dataset, dimension = get_dataset(args.dataset)
    point_type = dataset.attrs.get('point_type', 'float')
    distance = dataset.attrs['distance']
    definitions = get_definitions(
        args.definitions, dimension, point_type, distance, args.count)

    # Filter out, from the loaded definitions, all those query argument groups
    # that correspond to experiments that have already been run. (This might
    # mean removing a definition altogether, so we can't just use a list
    # comprehension.)
    filtered_definitions = []
    for definition in definitions:
        query_argument_groups = definition.query_argument_groups
        if not query_argument_groups:
            query_argument_groups = [[]]
        not_yet_run = []
        for query_arguments in query_argument_groups:
            fn = get_result_filename(args.dataset, args.count, definition,
                                     query_arguments, args.batch)
            if args.force or not os.path.exists(fn):
                not_yet_run.append(query_arguments)
        if not_yet_run:
            if definition.query_argument_groups:
                definition = definition._replace(
                    query_argument_groups=not_yet_run)
            filtered_definitions.append(definition)
    definitions = filtered_definitions

    random.shuffle(definitions)

    if args.algorithm:
        logger.info(f'running only {args.algorithm}')
        definitions = [d for d in definitions if d.algorithm == args.algorithm]

    if not args.local:
        # See which Docker images we have available
        docker_client = docker.from_env()
        docker_tags = set()
        for image in docker_client.images.list():
            for tag in image.tags:
                tag = tag.split(':')[0]
                docker_tags.add(tag)

        if args.docker_tag:
            logger.info(f'running only {args.docker_tag}')
            definitions = [
                d for d in definitions if d.docker_tag == args.docker_tag]

        if set(d.docker_tag for d in definitions).difference(docker_tags):
            logger.info(
                f'not all docker images available, only: {set(docker_tags)}')
            logger.info(
                f'missing docker images: '
                f'{set(d.docker_tag for d in definitions).difference(docker_tags)}')
            definitions = [
                d for d in definitions if d.docker_tag in docker_tags]
    else:
        def _test(df):
            status = algorithm_status(df)
            # If the module was loaded but doesn't actually have a constructor
            # of the right name, then the definition is broken
            if status == InstantiationStatus.NO_CONSTRUCTOR:
                raise Exception(
                    "%s.%s(%s): error: the module '%s' does not"
                    " expose the named constructor" % (
                        df.module, df.constructor, df.arguments, df.module))
            if status == InstantiationStatus.NO_MODULE:
                # If the module couldn't be loaded (presumably because of a
                # missing dependency), print a warning and remove this
                # definition from the list of things to be run
                logging.warning(
                    "%s.%s(%s): the module '%s' could not be "
                    "loaded; skipping" % (
                        df.module, df.constructor, df.arguments, df.module))
                return False
            else:
                return True
        definitions = [d for d in definitions if _test(d)]

    if not args.run_disabled:
        if len([d for d in definitions if d.disabled]):
            logger.info('Not running disabled algorithms: '
                        f'{[d for d in definitions if d.disabled]}')
        definitions = [d for d in definitions if not d.disabled]

    if args.max_n_algorithms >= 0:
        definitions = definitions[:args.max_n_algorithms]

    if len(definitions) == 0:
        raise Exception('Nothing to run')
    else:
        logger.info(f'Order: {definitions}')

    if args.parallelism > multiprocessing.cpu_count() - 1:
        raise Exception('Parallelism larger than %d! (CPU count minus one)' %
                        (multiprocessing.cpu_count() - 1))

    # Multiprocessing magic to farm this out to all CPUs
    queue = multiprocessing.Queue()
    for definition in definitions:
        queue.put(definition)

    if args.batch and args.parallelism > 1:
        raise Exception(
            'Batch mode uses all available CPU resources, --parallelism '
            f'should be set to 1. (Was: {args.parallelism})')

    workers = [
        multiprocessing.Process(target=run_worker, args=(i + 1, args, queue))
        for i in range(args.parallelism)]
    [worker.start() for worker in workers]
    [worker.join() for worker in workers]
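
# The parallel path above hands every worker process the same shared queue of
# definitions. run_worker itself is not part of this excerpt; the sketch
# below shows one plausible shape for it. The exact signatures of run and
# run_docker (e.g. whether they also take args.timeout or args.batch) are
# assumptions, not taken from this code.
import multiprocessing
from queue import Empty

def run_worker(worker_id, args, queue):
    # Drain the shared queue until it is empty, dispatching each definition
    # the same way the serial path does.
    while True:
        try:
            definition = queue.get(timeout=1)
        except Empty:
            break
        if args.local:
            run(definition, args.dataset, args.count, args.runs)
        else:
            run_docker(definition, args.dataset, args.count, args.runs)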
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset',
        metavar='NAME',
        help='the dataset to load training points from',
        default='glove-100-angular',
        choices=DATASETS.keys())
    parser.add_argument(
        "-k", "--count",
        default=10,
        type=positive_int,
        help="the number of near neighbours to search for")
    parser.add_argument(
        '--definitions',
        metavar='FILE',
        help='load algorithm definitions from FILE',
        default='algos.yaml')
    parser.add_argument(
        '--algorithm',
        metavar='NAME',
        help='run only the named algorithm',
        default=None)
    parser.add_argument(
        '--docker-tag',
        metavar='NAME',
        help='run only algorithms in a particular docker image',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true',
        default=argparse.SUPPRESS)
    parser.add_argument(
        '--force',
        help='re-run algorithms even if their results already exist',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help='run each algorithm instance %(metavar)s times and use only '
             'the best result',
        default=3)
    parser.add_argument(
        '--timeout',
        type=int,
        help='Timeout (in seconds) for each individual algorithm run, '
             'or -1 if no timeout should be set',
        default=-1)
    parser.add_argument(
        '--local',
        action='store_true',
        help='If set, then will run everything locally (inside the same '
             'process) rather than using Docker')
    parser.add_argument(
        '--max-n-algorithms',
        type=int,
        help='Max number of algorithms to run (just used for testing)',
        default=-1)
    args = parser.parse_args()

    if args.timeout == -1:
        args.timeout = None

    if hasattr(args, "list_algorithms"):
        list_algorithms(args.definitions)
        sys.exit(0)

    # See which Docker images we have available
    docker_client = docker.from_env()
    docker_tags = set()
    for image in docker_client.images.list():
        for tag in image.tags:
            tag = tag.split(':')[0]
            docker_tags.add(tag)

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    dataset = get_dataset(args.dataset)
    dimension = len(dataset['train'][0])  # TODO(erikbern): ugly
    point_type = 'float'  # TODO(erikbern): should look at the type of X_train
    distance = dataset.attrs['distance']
    definitions = get_definitions(
        args.definitions, dimension, point_type, distance, args.count)

    # TODO(erikbern): should make this a helper function somewhere
    definitions = [
        definition for definition in definitions
        if not os.path.exists(
            get_result_filename(args.dataset, args.count, definition))]

    random.shuffle(definitions)

    if args.algorithm:
        print('running only', args.algorithm)
        definitions = [d for d in definitions if d.algorithm == args.algorithm]

    if args.docker_tag:
        print('running only', args.docker_tag)
        definitions = [
            d for d in definitions if d.docker_tag == args.docker_tag]

    if set(d.docker_tag for d in definitions).difference(docker_tags):
        print('not all docker images available, only:', set(docker_tags))
        print('missing docker images:',
              set(d.docker_tag for d in definitions).difference(docker_tags))
        definitions = [d for d in definitions if d.docker_tag in docker_tags]

    if args.max_n_algorithms >= 0:
        definitions = definitions[:args.max_n_algorithms]

    print('order:', definitions)

    for definition in definitions:
        print(definition, '...')
        try:
            if args.local:
                run(definition, args.dataset, args.count, args.runs)
            else:
                run_docker(definition, args.dataset, args.count, args.runs)
        except KeyboardInterrupt:
            break
        except Exception:
            traceback.print_exc()
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset',
        metavar='NAME',
        help='the dataset to load training points from',
        default='glove-100-angular')
    parser.add_argument(
        "-k", "--count",
        default=10,
        type=positive_int,
        help="the number of near neighbours to search for")
    parser.add_argument(
        '--definitions',
        metavar='FILE',
        help='load algorithm definitions from FILE',
        default='algos.yaml')
    parser.add_argument(
        '--algorithm',
        metavar='NAME',
        help='run only the named algorithm',
        default=None)
    parser.add_argument(
        '--sub-algorithm',
        metavar='NAME',
        help='run only the named instance of an algorithm (requires --algo)',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true',
        default=argparse.SUPPRESS)
    parser.add_argument(
        '--force',
        help='re-run algorithms even if their results already exist',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help='run each algorithm instance %(metavar)s times and use only '
             'the best result',
        default=3)
    parser.add_argument(
        '--timeout',
        type=int,
        help='Timeout (in seconds) for each individual algorithm run, '
             'or -1 if no timeout should be set',
        default=-1)
    parser.add_argument(
        '--single',
        help='run only a single algorithm instance at a time',
        action='store_true')
    parser.add_argument(
        '--batch',
        help='Provide Queryset as Batch',
        action='store_true')
    parser.add_argument(
        '--no_save_index',
        help='do not save indices',
        action='store_true')
    args = parser.parse_args()

    if args.timeout == -1:
        args.timeout = None

    definitions = get_definitions(args.definitions)
    if hasattr(args, "list_algorithms"):
        print('The following algorithms are supported...')
        for point in definitions:
            print('\t... for the point type "%s"...' % point)
            for metric in definitions[point]:
                print('\t\t... and the distance metric "%s":' % metric)
                for algorithm in definitions[point][metric]:
                    print('\t\t\t%s' % algorithm)
        sys.exit(0)

    # Set resource limits to prevent memory bombs
    memory_limit = 12 * 2**30
    soft, hard = resource.getrlimit(resource.RLIMIT_DATA)
    if soft == resource.RLIM_INFINITY or soft >= memory_limit:
        print('resetting memory limit from', soft, 'to', memory_limit)
        resource.setrlimit(resource.RLIMIT_DATA, (memory_limit, hard))

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    dataset = get_dataset(args.dataset)
    X_train = dataset['train']
    X_test = dataset['test']
    distance = dataset.attrs['distance']
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    algos_already_run = set()
    if not args.force:
        for run in get_results(args.dataset, args.count, distance):
            algos_already_run.add((run.attrs["library"], run.attrs["name"]))

    point_type = 'float'  # TODO(erikbern): should look at the type of X_train
    algos = get_algorithms(definitions, constructors,
                           len(X_train[0]), point_type, distance, args.count)

    if args.algorithm:
        print('running only', args.algorithm)
        algos = {args.algorithm: algos[args.algorithm]}
        if args.sub_algorithm:
            algos[args.algorithm] = [
                algo for algo in algos[args.algorithm]
                if algo.name == args.sub_algorithm]

    algos_flat = []
    for library in algos.keys():
        for algo in algos[library]:
            if (library, algo.name) not in algos_already_run:
                algos_flat.append((library, algo))

    random.shuffle(algos_flat)
    print('order:', [a.name for l, a in algos_flat])

    for library, algo in algos_flat:
        recv_pipe, send_pipe = multiprocessing.Pipe(duplex=False)
        print(algo.name, '...')
        # Spawn a subprocess to force the memory to be reclaimed at the end
        p = multiprocessing.Process(
            target=run_algo,
            args=(args.count, X_train, X_test, library, algo, distance,
                  send_pipe, args.runs, args.single, args.batch))
        p.start()
        send_pipe.close()
        timed_out = False
        try:
            r = recv_pipe.poll(args.timeout)
            if r:
                # If there's something waiting in the pipe at this point,
                # then the worker has begun sending us results and we should
                # receive them
                attrs, results = recv_pipe.recv()
                if "expect_extra" in attrs:
                    if attrs["expect_extra"]:
                        attrs["extra"] = recv_pipe.recv()
                    del attrs["expect_extra"]
            else:
                # If we've exceeded the timeout and there are no results,
                # then terminate the worker process (XXX: what should we do
                # about algo.done() here?)
                p.terminate()
                timed_out = True
                results = None
        except EOFError:
            # The worker has crashed or otherwise failed to send us results
            results = None
        p.join()
        recv_pipe.close()
        if results:
            store_results(attrs, results, args.dataset, args.count, distance)
        elif timed_out:
            print('algorithm worker process took too long')
        else:
            print('algorithm worker process stopped unexpectedly')
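
# The parent/worker handshake above implements a soft timeout with
# multiprocessing.Pipe plus poll(timeout): the parent waits for the first
# message for at most args.timeout seconds and terminates the worker if
# nothing arrives. Below is a self-contained demo of that pattern; all names
# in it are invented for the demo.
import multiprocessing
import time

def _demo_worker(conn, delay):
    time.sleep(delay)           # pretend to build an index and run queries
    conn.send({'ok': True})
    conn.close()

if __name__ == '__main__':
    recv_pipe, send_pipe = multiprocessing.Pipe(duplex=False)
    p = multiprocessing.Process(target=_demo_worker, args=(send_pipe, 1))
    p.start()
    send_pipe.close()           # parent keeps only the receiving end
    if recv_pipe.poll(5):       # True as soon as data is waiting
        print('worker sent:', recv_pipe.recv())
    else:
        p.terminate()           # timed out: kill the worker, discard the run
        print('worker took too long')
    p.join()
    recv_pipe.close()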
    action='store_true')
parser.add_argument(
    '-Y', '--y-log',
    help='Draw the Y-axis using a logarithmic scale',
    action='store_true')
parser.add_argument(
    '--raw',
    help='Show raw results (not just Pareto frontier) in faded colours',
    action='store_true')
args = parser.parse_args()

if not args.output:
    args.output = 'results/%s.png' % args.dataset
    print('writing output to %s' % args.output)

dataset = get_dataset(args.dataset)
dimension = len(dataset['train'][0])  # TODO(erikbern): ugly
point_type = 'float'  # TODO(erikbern): should look at the type of X_train
distance = dataset.attrs['distance']
count = int(args.count)
definitions = get_definitions(
    args.definitions, dimension, point_type, distance, count)
unique_algorithms = get_unique_algorithms(args.definitions)
linestyles = create_linestyles(unique_algorithms)
results = load_results(args.dataset, count, definitions)
runs = compute_metrics(list(dataset["distances"]), results,
                       args.x_axis, args.y_axis)
if not runs:
    raise Exception('Nothing to plot')

create_plot(runs, args.raw, args.x_log, args.y_log,
            args.x_axis, args.y_axis, args.output, linestyles)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset',
        metavar='NAME',
        help='the dataset to load training points from',
        default='glove')
    parser.add_argument(
        '--query-dataset',
        metavar='NAME',
        help='load query points from another dataset instead of choosing '
             'them randomly from the training dataset',
        default=None)
    parser.add_argument(
        "-k", "--count",
        default=10,
        type=positive_int,
        help="the number of near neighbours to search for")
    parser.add_argument(
        '--distance',
        help='the metric used to calculate the distance between points',
        default='angular')
    parser.add_argument(
        '--limit',
        help='the maximum number of points to load from the dataset, '
             'or -1 to load all of them',
        type=int,
        default=-1)
    parser.add_argument(
        '--definitions',
        metavar='FILE',
        help='load algorithm definitions from FILE',
        default='algos.yaml')
    parser.add_argument(
        '--algorithm',
        metavar='NAME',
        help='run only the named algorithm',
        default=None)
    parser.add_argument(
        '--sub-algorithm',
        metavar='NAME',
        help='run only the named instance of an algorithm (requires --algo)',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true',
        default=argparse.SUPPRESS)
    parser.add_argument(
        '--force',
        help='re-run algorithms even if their results already exist',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help='run each algorithm instance %(metavar)s times and use only '
             'the best result',
        default=3)
    parser.add_argument(
        '--timeout',
        type=int,
        help='Timeout (in seconds) for each individual algorithm run, '
             'or -1 if no timeout should be set',
        default=-1)
    parser.add_argument(
        '--single',
        help='run only a single algorithm instance at a time',
        action='store_true')
    parser.add_argument(
        '--no_save_index',
        help='do not save indices',
        action='store_true')
    args = parser.parse_args()

    if args.timeout == -1:
        args.timeout = None

    definitions = get_definitions(args.definitions)
    if hasattr(args, "list_algorithms"):
        print("The following algorithms are supported...")
        for point in definitions:
            print("\t... for the point type '%s'..." % point)
            for metric in definitions[point]:
                print("\t\t... and the distance metric '%s':" % metric)
                for algorithm in definitions[point][metric]:
                    print("\t\t\t%s" % algorithm)
        sys.exit(0)

    # Set resource limits to prevent memory bombs
    memory_limit = 12 * 2**30
    soft, hard = resource.getrlimit(resource.RLIMIT_DATA)
    if soft == resource.RLIM_INFINITY or soft >= memory_limit:
        print('resetting memory limit from', soft, 'to', memory_limit)
        resource.setrlimit(resource.RLIMIT_DATA, (memory_limit, hard))

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    manifest, X = get_dataset(args.dataset, args.limit)
    if not args.query_dataset:
        X_train, X_test = split_dataset(
            X, test_size=manifest['dataset']['test_size'])
    else:
        X_train = X
        query_manifest, X_test = get_dataset(args.query_dataset)
        assert manifest["dataset"] == query_manifest["dataset"], \
            "error: the training dataset and query dataset have " \
            "incompatible manifests"

    queries_fn = get_query_cache_path(
        args.dataset, args.count, args.limit, args.distance,
        args.query_dataset)
    print('storing queries in', queries_fn)

    if not os.path.exists(queries_fn):
        queries = compute_distances(args.distance, args.count,
                                    X_train, X_test)
        with open(queries_fn, 'wb') as f:  # binary mode for pickle
            pickle.dump(queries, f)
    else:
        with open(queries_fn, 'rb') as f:
            queries = pickle.load(f)

    print('got', len(queries), 'queries')

    algos_already_run = set()
    if not args.force:
        for run in get_results(args.dataset, args.limit, args.count,
                               args.distance, args.query_dataset):
            algos_already_run.add((run["library"], run["name"]))

    point_type = manifest['dataset']['point_type']
    algos = get_algorithms(definitions, constructors, len(X_train[0]),
                           point_type, args.distance, args.count)

    if args.algorithm:
        print('running only', args.algorithm)
        algos = {args.algorithm: algos[args.algorithm]}
        if args.sub_algorithm:
            algos[args.algorithm] = [
                algo for algo in algos[args.algorithm]
                if algo.name == args.sub_algorithm]

    algos_flat = []
    for library in algos.keys():
        for algo in algos[library]:
            if (library, algo.name) not in algos_already_run:
                algos_flat.append((library, algo))

    random.shuffle(algos_flat)
    print('order:', [a.name for l, a in algos_flat])

    for library, algo in algos_flat:
        recv_pipe, send_pipe = multiprocessing.Pipe(duplex=False)
        print(algo.name, '...')
        # Spawn a subprocess to force the memory to be reclaimed at the end
        p = multiprocessing.Process(
            target=run_algo,
            args=(args.count, X_train, queries, library, algo,
                  args.distance, send_pipe, args.runs, args.single))
        p.start()
        send_pipe.close()
        timed_out = False
        try:
            if recv_pipe.poll(args.timeout):
                # If there's something waiting in the pipe at this point,
                # then the worker has begun sending us results and we should
                # receive them
                results = recv_pipe.recv()
                if "expect_extra" in results:
                    if results["expect_extra"]:
                        results["extra"] = recv_pipe.recv()
                    del results["expect_extra"]
            else:
                # If we've exceeded the timeout and there are no results,
                # then terminate the worker process (XXX: what should we do
                # about algo.done() here?)
                p.terminate()
                timed_out = True
                results = None
        except EOFError:
            # The worker has crashed or otherwise failed to send us results
            results = None
        p.join()
        recv_pipe.close()
        if results:
            store_results(results, args.dataset, args.limit, args.count,
                          args.distance, args.query_dataset)
        elif timed_out:
            print('(algorithm worker process took too long)')
        else:
            print('(algorithm worker process stopped unexpectedly)')
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset',
        metavar='NAME',
        help='the dataset to load training points from',
        default='glove-100-angular',
        choices=DATASETS.keys())
    parser.add_argument(
        "-k", "--count",
        default=10,
        type=positive_int,
        help="the number of near neighbours to search for")
    parser.add_argument(
        '--definitions',
        metavar='FILE',
        help='load algorithm definitions from FILE',
        default='algos.yaml')
    parser.add_argument(
        '--algorithm',
        metavar='NAME',
        help='run only the named algorithm',
        default=None)
    parser.add_argument(
        '--docker-tag',
        metavar='NAME',
        help='run only algorithms in a particular docker image',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true')
    parser.add_argument(
        '--force',
        help='re-run algorithms even if their results already exist',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help='run each algorithm instance %(metavar)s times and use only '
             'the best result',
        default=3)
    parser.add_argument(
        '--timeout',
        type=int,
        help='Timeout (in seconds) for each individual algorithm run, '
             'or -1 if no timeout should be set',
        default=-1)
    parser.add_argument(
        '--local',
        action='store_true',
        help='If set, then will run everything locally (inside the same '
             'process) rather than using Docker')
    parser.add_argument(
        '--max-n-algorithms',
        type=int,
        help='Max number of algorithms to run (just used for testing)',
        default=-1)
    parser.add_argument(
        '--run-disabled',
        help='run algorithms that are disabled in algos.yml',
        action='store_true')
    args = parser.parse_args()

    if args.timeout == -1:
        args.timeout = None

    if args.list_algorithms:
        list_algorithms(args.definitions)
        sys.exit(0)

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    dataset = get_dataset(args.dataset)
    dimension = len(dataset['train'][0])  # TODO(erikbern): ugly
    point_type = 'float'  # TODO(erikbern): should look at the type of X_train
    distance = dataset.attrs['distance']
    definitions = get_definitions(
        args.definitions, dimension, point_type, distance, args.count)

    # Filter out, from the loaded definitions, all those query argument groups
    # that correspond to experiments that have already been run. (This might
    # mean removing a definition altogether, so we can't just use a list
    # comprehension.)
    filtered_definitions = []
    for definition in definitions:
        query_argument_groups = definition.query_argument_groups
        if not query_argument_groups:
            query_argument_groups = [[]]
        not_yet_run = []
        for query_arguments in query_argument_groups:
            fn = get_result_filename(
                args.dataset, args.count, definition, query_arguments)
            if not os.path.exists(fn):
                not_yet_run.append(query_arguments)
        if not_yet_run:
            if definition.query_argument_groups:
                definition = definition._replace(
                    query_argument_groups=not_yet_run)
            filtered_definitions.append(definition)
    definitions = filtered_definitions

    random.shuffle(definitions)

    if args.algorithm:
        print('running only', args.algorithm)
        definitions = [d for d in definitions if d.algorithm == args.algorithm]

    if not args.local:
        # See which Docker images we have available
        docker_client = docker.from_env()
        docker_tags = set()
        for image in docker_client.images.list():
            for tag in image.tags:
                tag = tag.split(':')[0]
                docker_tags.add(tag)

        if args.docker_tag:
            print('running only', args.docker_tag)
            definitions = [
                d for d in definitions if d.docker_tag == args.docker_tag]

        if set(d.docker_tag for d in definitions).difference(docker_tags):
            print('not all docker images available, only:', set(docker_tags))
            print('missing docker images:',
                  set(d.docker_tag for d in definitions).difference(docker_tags))
            definitions = [
                d for d in definitions if d.docker_tag in docker_tags]
    else:
        def _test(df):
            status = algorithm_status(df)
            # If the module was loaded but doesn't actually have a constructor
            # of the right name, then the definition is broken
            assert status != InstantiationStatus.NO_CONSTRUCTOR, \
                "%s.%s(%s): error: the module '%s' does not expose the " \
                "named constructor" % (
                    df.module, df.constructor, df.arguments, df.module)
            if status == InstantiationStatus.NO_MODULE:
                # If the module couldn't be loaded (presumably because of a
                # missing dependency), print a warning and remove this
                # definition from the list of things to be run
                print("%s.%s(%s): warning: the module '%s' could not be "
                      "loaded; skipping" % (
                          df.module, df.constructor, df.arguments, df.module))
                return False
            else:
                return True
        definitions = [d for d in definitions if _test(d)]

    if not args.run_disabled:
        if len([d for d in definitions if d.disabled]):
            print('Not running disabled algorithms:',
                  [d for d in definitions if d.disabled])
        definitions = [d for d in definitions if not d.disabled]

    if args.max_n_algorithms >= 0:
        definitions = definitions[:args.max_n_algorithms]

    if len(definitions) == 0:
        raise Exception('Nothing to run')
    else:
        print('Order:', definitions)

    for definition in definitions:
        print(definition, '...')
        try:
            if args.local:
                run(definition, args.dataset, args.count, args.runs)
            else:
                run_docker(definition, args.dataset, args.count, args.runs)
        except KeyboardInterrupt:
            break
        except Exception:
            traceback.print_exc()