Example #1
    def testClusterSpec(self):
        # Use cluster_spec
        config = k8s_tensorflow_lib.GenerateConfig(num_workers=1,
                                                   num_param_servers=1,
                                                   port=5000,
                                                   request_load_balancer=True,
                                                   docker_image='test_image',
                                                   name_prefix='abc',
                                                   use_shared_volume=False,
                                                   use_cluster_spec=True)
        self.assertFalse('worker_hosts' in config)
        self.assertFalse('ps_hosts' in config)
        self.assertTrue(
            '"--cluster_spec=worker|abc-worker0:5000,ps|abc-ps0:5000"' in
            config)

        # Don't use cluster_spec
        config = k8s_tensorflow_lib.GenerateConfig(num_workers=1,
                                                   num_param_servers=1,
                                                   port=5000,
                                                   request_load_balancer=True,
                                                   docker_image='test_image',
                                                   name_prefix='abc',
                                                   use_shared_volume=False,
                                                   use_cluster_spec=False)
        self.assertFalse('cluster_spec' in config)
        self.assertTrue('"--worker_hosts=abc-worker0:5000"' in config)
        self.assertTrue('"--ps_hosts=abc-ps0:5000"' in config)
Example #2
    def testGenerateConfig_SharedVolume(self):
        # Use shared directory
        config = k8s_tensorflow_lib.GenerateConfig(num_workers=1,
                                                   num_param_servers=1,
                                                   port=5000,
                                                   request_load_balancer=False,
                                                   docker_image='test_image',
                                                   name_prefix='abc',
                                                   use_shared_volume=True)
        self.assertTrue('/shared' in config)

        # Don't use shared directory
        config = k8s_tensorflow_lib.GenerateConfig(num_workers=1,
                                                   num_param_servers=1,
                                                   port=5000,
                                                   request_load_balancer=False,
                                                   docker_image='test_image',
                                                   name_prefix='abc',
                                                   use_shared_volume=False)
        self.assertFalse('/shared' in config)
Example #3
    def testEnvVar(self):
        # Use loadbalancer
        config = k8s_tensorflow_lib.GenerateConfig(
            num_workers=1,
            num_param_servers=1,
            port=5000,
            request_load_balancer=True,
            docker_image='test_image',
            name_prefix='abc',
            use_shared_volume=False,
            env_vars={'test1': 'test1_value', 'test2': 'test2_value'})
        self.assertTrue('{name: "test1", value: "test1_value"}' in config)
        self.assertTrue('{name: "test2", value: "test2_value"}' in config)
Example #4
def main():
    config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                               'benchmark_configs.yml')
    with open(config_file, 'r') as config_fh:
        config_text = config_fh.read()
    configs = yaml.safe_load(config_text)

    docker_client = docker.from_env()
    time_tag = datetime.now().strftime('%d_%m_%Y_%H_%M')
    # Create directories to store kubernetes yaml configs in.
    if not os.path.isdir(FLAGS.config_output_file_dir):
        os.makedirs(FLAGS.config_output_file_dir)
    # Keeps track of already built docker images in case multiple benchmarks
    # use the same docker image.
    benchmark_name_to_docker_image = {}

    # TODO(annarev): run benchmarks in parallel instead of sequentially.
    for config in configs:
        name = _ConvertToValidName(str(config['benchmark_name']))
        if name in benchmark_name_to_docker_image:
            docker_image = benchmark_name_to_docker_image[name]
        else:
            docker_image = _BuildAndPushDockerImage(
                docker_client, config['docker_file'], name, time_tag,
                FLAGS.store_docker_image_in_gcloud)
            benchmark_name_to_docker_image[name] = docker_image
        env_vars = {
            _OUTPUT_FILE_ENV_VAR:
            os.path.join(FLAGS.benchmark_results_dir, name + '.json'),
            _TEST_NAME_ENV_VAR:
            name
        }
        env_vars.update(config.get('env_vars', {}))
        args = config.get('args', {})
        kubernetes_config = k8s_tensorflow_lib.GenerateConfig(
            config['worker_count'],
            config['ps_count'],
            _PORT,
            request_load_balancer=False,
            docker_image=docker_image,
            name_prefix=name,
            additional_args=args,
            env_vars=env_vars,
            use_shared_volume=False,
            use_cluster_spec=False)

        kubernetes_config_path = os.path.join(FLAGS.config_output_file_dir,
                                              name + '.yaml')
        with open(kubernetes_config_path, 'w') as output_config_file:
            output_config_file.write(kubernetes_config)

        _RunBenchmark(name, kubernetes_config_path)
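Here name is used both as the Kubernetes name_prefix and as a file name, so _ConvertToValidName presumably normalizes the benchmark name onto the DNS-1123 character set Kubernetes requires (lowercase alphanumerics and '-'). A plausible sketch, not the actual implementation:

import re

# Hypothetical sketch of _ConvertToValidName: Kubernetes object names must
# be DNS-1123 compliant (lowercase alphanumerics and '-'), so anything else
# is replaced. The real helper may differ.
def _ConvertToValidName(name):
    return re.sub(r'[^a-z0-9-]', '-', name.lower())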
Example #5
def main():
    with open(FLAGS.benchmark_configs_file, 'r') as config_fh:
        config_text = config_fh.read()
    configs = yaml.safe_load(config_text)

    docker_client = docker.from_env()
    time_tag = datetime.now().strftime('%d_%m_%Y_%H_%M')
    # Create directories to store kubernetes yaml configs in.
    if not os.path.isdir(FLAGS.config_output_file_dir):
        os.makedirs(FLAGS.config_output_file_dir)
    # Keeps track of already built docker images in case multiple benchmarks
    # use the same docker image.
    benchmark_name_to_docker_image = {}

    # TODO(annarev): run benchmarks in parallel instead of sequentially.
    for config in configs:
        name = _ConvertToValidName(str(config['benchmark_name']))
        if name in benchmark_name_to_docker_image:
            docker_image = benchmark_name_to_docker_image[name]
        elif FLAGS.build_docker_image:
            docker_image = _BuildAndPushDockerImage(
                docker_client, config['docker_file'], name, time_tag,
                FLAGS.store_docker_image_in_gcloud)
            benchmark_name_to_docker_image[name] = docker_image
        else:
            docker_image = _GetMostRecentDockerImageFromGcloud(
                _DOCKER_IMAGE_PATTERN % name)
            if not docker_image:
                raise NoImageFoundError('No tags found for image %s.' %
                                        (_DOCKER_IMAGE_PATTERN % name))

        env_vars = {
            _OUTPUT_FILE_ENV_VAR:
            os.path.join(FLAGS.benchmark_results_dir, name + '.json'),
            _TEST_NAME_ENV_VAR:
            name
        }
        gpu_count = config.get('gpus_per_machine', 0)
        volumes = {}
        if gpu_count > 0:
            volumes = get_gpu_volume_mounts()
            env_vars['LD_LIBRARY_PATH'] = (
                '/usr/lib/cuda:/usr/lib/nvidia:/usr/lib/x86_64-linux-gnu')

        env_vars.update(config.get('env_vars', {}))
        args = config.get('args', {})
        kubernetes_config = k8s_tensorflow_lib.GenerateConfig(
            config['worker_count'],
            config['ps_count'],
            _PORT,
            request_load_balancer=False,
            docker_image=docker_image,
            name_prefix=name,
            additional_args=args,
            env_vars=env_vars,
            volumes=volumes,
            use_shared_volume=False,
            use_cluster_spec=False,
            gpu_limit=gpu_count)

        kubernetes_config_path = os.path.join(FLAGS.config_output_file_dir,
                                              name + '.yaml')
        with open(kubernetes_config_path, 'w') as output_config_file:
            output_config_file.write(kubernetes_config)

        success = _RunBenchmark(name, kubernetes_config_path)
        if not success:
            sys.exit(1)
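When --build_docker_image is off, the script falls back to the most recently pushed image instead of rebuilding. A hedged sketch of what _GetMostRecentDockerImageFromGcloud might do; the gcloud invocation and output handling here are assumptions, not the actual helper.

import subprocess

# Hypothetical sketch: query gcloud for the newest tag of the image and
# return a fully qualified image name, or None if no tags exist. The real
# helper may use a different gcloud invocation or output format.
def _GetMostRecentDockerImageFromGcloud(docker_image):
    tag = subprocess.check_output(
        ['gcloud', 'container', 'images', 'list-tags', docker_image,
         '--limit=1', '--format=value(tags[0])'])
    tag = tag.strip().decode('utf-8')
    if not tag:
        return None
    return '%s:%s' % (docker_image, tag)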
Example #6
def main():
    """Do arg parsing."""
    parser = argparse.ArgumentParser()
    parser.register('type', 'bool', lambda v: v.lower() in
                    ('true', 't', 'y', 'yes'))
    parser.add_argument('--num_workers',
                        type=int,
                        default=2,
                        help='How many worker pods to run')
    parser.add_argument('--num_parameter_servers',
                        type=int,
                        default=1,
                        help='How many parameter server pods to run')
    parser.add_argument('--grpc_port',
                        type=int,
                        default=DEFAULT_PORT,
                        help='GRPC server port (Default: %d)' % DEFAULT_PORT)
    parser.add_argument(
        '--request_load_balancer',
        type='bool',
        default=False,
        help='To request worker0 to be exposed on a public IP '
        'address via an external load balancer, enabling you to '
        'run client processes from outside the cluster')
    parser.add_argument(
        '--docker_image',
        type=str,
        default=DEFAULT_DOCKER_IMAGE,
        help='Override default docker image for the TensorFlow '
        'GRPC server')
    parser.add_argument('--name_prefix',
                        type=str,
                        default='tf',
                        help='Prefix for job names. Jobs will be named as '
                        '<name_prefix>_worker|ps<task_id>')
    parser.add_argument('--use_shared_volume',
                        type='bool',
                        default=True,
                        help='Whether to mount /shared directory from host to '
                        'the pod')
    args = parser.parse_args()

    if args.num_workers <= 0:
        sys.stderr.write(
            '--num_workers must be greater than 0; received %d\n' %
            args.num_workers)
        sys.exit(1)
    if args.num_parameter_servers <= 0:
        sys.stderr.write(
            '--num_parameter_servers must be greater than 0; received %d\n' %
            args.num_parameter_servers)
        sys.exit(1)

    # Generate contents of yaml config
    yaml_config = k8s_tensorflow_lib.GenerateConfig(
        args.num_workers,
        args.num_parameter_servers,
        args.grpc_port,
        args.request_load_balancer,
        args.docker_image,
        args.name_prefix,
        env_vars=None,
        use_shared_volume=args.use_shared_volume)
    print(yaml_config)  # pylint: disable=superfluous-parens
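A note on the parser.register('type', 'bool', ...) line above: argparse has no built-in boolean parsing, and the bool builtin treats any non-empty string as true, so a converter that only accepts explicit truthy spellings is registered instead. A minimal illustration of the pitfall:

# Why a custom 'bool' type is registered: the bool builtin treats any
# non-empty string as true, so bool('False') would silently parse as True.
def to_bool(v):
    return v.lower() in ('true', 't', 'y', 'yes')

assert bool('False') is True      # the pitfall with the builtin
assert to_bool('False') is False  # the registered converter's behavior
assert to_bool('yes') is True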
Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.register('type', 'bool', lambda v: v.lower() in
                    ('true', 't', 'y', 'yes'))
    parser.add_argument('--benchmark_configs_file',
                        type=str,
                        default=None,
                        required=True,
                        help='YAML file with benchmark configs.')
    parser.add_argument('--benchmark_config_output',
                        type=str,
                        default=None,
                        required=True,
                        help='YAML file to store final config.')
    parser.add_argument('--docker_image',
                        type=str,
                        default=None,
                        required=True,
                        help='Docker image to use on K8S to run the test.')
    parser.add_argument(
        '--cuda_lib_dir',
        type=str,
        default=None,
        required=False,
        help='Directory where cuda library files are located on gcloud node.')
    parser.add_argument(
        '--nvidia_lib_dir',
        type=str,
        default=None,
        required=False,
        help='Directory where nvidia library files are located on gcloud node.'
    )

    flags, _ = parser.parse_known_args()
    logging.basicConfig(level=logging.DEBUG)

    config_base_path = os.path.dirname(__file__)

    config_path = os.path.join(config_base_path, flags.benchmark_configs_file)
    with open(config_path, 'r') as config_fh:
        config_text = config_fh.read()
    configs = yaml.safe_load(config_text)

    # TODO(annarev): run benchmarks in parallel instead of sequentially.
    for config in configs:
        name = _ConvertToValidName(str(config['benchmark_name']))

        env_vars = {_TEST_NAME_ENV_VAR: name}
        gpu_count = config.get('gpus_per_machine', 0)
        volumes = {}
        if gpu_count > 0:
            volumes = _GetGpuVolumeMounts(flags)
            env_vars['LD_LIBRARY_PATH'] = (
                '/usr/lib/cuda:/usr/lib/nvidia:/usr/lib/x86_64-linux-gnu')

        env_vars.update(config.get('env_vars', {}))
        args = config.get('args', {})
        kubernetes_config = k8s_tensorflow_lib.GenerateConfig(
            config['worker_count'],
            config['ps_count'],
            _PORT,
            request_load_balancer=False,
            docker_image=flags.docker_image,
            name_prefix=name,
            additional_args=args,
            env_vars=env_vars,
            volumes=volumes,
            use_shared_volume=False,
            use_cluster_spec=False,
            gpu_limit=gpu_count)

        with open(flags.benchmark_config_output, 'w') as output_config_file:
            output_config_file.write(kubernetes_config)
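The --cuda_lib_dir and --nvidia_lib_dir flags feed _GetGpuVolumeMounts, which presumably maps those host directories onto the container paths that the LD_LIBRARY_PATH set above points at. A hypothetical sketch; the exact volumes structure k8s_tensorflow_lib expects is an assumption.

# Hypothetical sketch of _GetGpuVolumeMounts: map host library directories
# to the container paths used in LD_LIBRARY_PATH. The real helper and the
# volumes dict shape expected by GenerateConfig may differ.
def _GetGpuVolumeMounts(flags):
    volumes = {}
    if flags.cuda_lib_dir:
        volumes['cuda-libraries'] = {
            'host_path': flags.cuda_lib_dir,
            'mount_path': '/usr/lib/cuda',
        }
    if flags.nvidia_lib_dir:
        volumes['nvidia-libraries'] = {
            'host_path': flags.nvidia_lib_dir,
            'mount_path': '/usr/lib/nvidia',
        }
    return volumes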