def testClusterSpec(self):
  """Tests that --cluster_spec is emitted iff use_cluster_spec is True.

  With use_cluster_spec=True the generated config must carry a single
  --cluster_spec flag and no --worker_hosts/--ps_hosts flags; with
  use_cluster_spec=False the reverse holds.
  """
  # Use cluster_spec.
  config = k8s_tensorflow_lib.GenerateConfig(
      num_workers=1,
      num_param_servers=1,
      port=5000,
      request_load_balancer=True,
      docker_image='test_image',
      name_prefix='abc',
      use_shared_volume=False,
      use_cluster_spec=True)
  # assertIn/assertNotIn report the haystack on failure, unlike
  # assertTrue(x in y) which only prints "False is not true".
  self.assertNotIn('worker_hosts', config)
  self.assertNotIn('ps_hosts', config)
  self.assertIn(
      '"--cluster_spec=worker|abc-worker0:5000,ps|abc-ps0:5000"', config)

  # Don't use cluster_spec.
  config = k8s_tensorflow_lib.GenerateConfig(
      num_workers=1,
      num_param_servers=1,
      port=5000,
      request_load_balancer=True,
      docker_image='test_image',
      name_prefix='abc',
      use_shared_volume=False,
      use_cluster_spec=False)
  self.assertNotIn('cluster_spec', config)
  self.assertIn('"--worker_hosts=abc-worker0:5000"', config)
  self.assertIn('"--ps_hosts=abc-ps0:5000"', config)
def testGenerateConfig_SharedVolume(self):
  """Tests that the /shared mount appears iff use_shared_volume is True."""
  # Use shared directory.
  config = k8s_tensorflow_lib.GenerateConfig(
      num_workers=1,
      num_param_servers=1,
      port=5000,
      request_load_balancer=False,
      docker_image='test_image',
      name_prefix='abc',
      use_shared_volume=True)
  # assertIn/assertNotIn give clearer failure output than
  # assertTrue(x in y) / assertFalse(x in y).
  self.assertIn('/shared', config)

  # Don't use shared directory.
  config = k8s_tensorflow_lib.GenerateConfig(
      num_workers=1,
      num_param_servers=1,
      port=5000,
      request_load_balancer=False,
      docker_image='test_image',
      name_prefix='abc',
      use_shared_volume=False)
  self.assertNotIn('/shared', config)
def testEnvVar(self):
  """Tests that entries in env_vars are rendered into the generated config."""
  # Use loadbalancer.
  config = k8s_tensorflow_lib.GenerateConfig(
      num_workers=1,
      num_param_servers=1,
      port=5000,
      request_load_balancer=True,
      docker_image='test_image',
      name_prefix='abc',
      use_shared_volume=False,
      env_vars={'test1': 'test1_value',
                'test2': 'test2_value'})
  # assertIn gives clearer failure output than assertTrue(x in y).
  self.assertIn('{name: "test1", value: "test1_value"}', config)
  self.assertIn('{name: "test2", value: "test2_value"}', config)
def main():
  """Builds/pushes a docker image per benchmark config, then runs each one.

  Reads benchmark_configs.yml next to this script, builds (or reuses) a
  docker image for every benchmark entry, writes a kubernetes yaml config
  under FLAGS.config_output_file_dir, and launches the benchmarks
  sequentially.
  """
  config_file = os.path.join(
      os.path.dirname(os.path.realpath(__file__)), 'benchmark_configs.yml')
  # Use a context manager so the config file handle is closed promptly
  # instead of being leaked until GC.
  with open(config_file, 'r') as f:
    config_text = f.read()
  # NOTE(review): yaml.load without an explicit Loader can construct
  # arbitrary Python objects; prefer yaml.safe_load if these configs can
  # ever come from an untrusted source.
  configs = yaml.load(config_text)
  docker_client = docker.from_env()
  time_tag = datetime.now().strftime('%d_%m_%Y_%H_%M')
  # Create directories to store kubernetes yaml configs in.
  if not os.path.isdir(FLAGS.config_output_file_dir):
    os.makedirs(FLAGS.config_output_file_dir)
  # Keeps track of already built docker images in case multiple benchmarks
  # use the same docker image.
  benchmark_name_to_docker_image = {}
  # TODO(annarev): run benchmarks in parallel instead of sequentially.
  for config in configs:
    name = _ConvertToValidName(str(config['benchmark_name']))
    if name in benchmark_name_to_docker_image:
      docker_image = benchmark_name_to_docker_image[name]
    else:
      docker_image = _BuildAndPushDockerImage(
          docker_client, config['docker_file'], name, time_tag,
          FLAGS.store_docker_image_in_gcloud)
      benchmark_name_to_docker_image[name] = docker_image
    env_vars = {
        _OUTPUT_FILE_ENV_VAR: os.path.join(
            FLAGS.benchmark_results_dir, name + '.json'),
        _TEST_NAME_ENV_VAR: name,
    }
    env_vars.update(config.get('env_vars', {}))
    args = config.get('args', {})
    kubernetes_config = k8s_tensorflow_lib.GenerateConfig(
        config['worker_count'],
        config['ps_count'],
        _PORT,
        request_load_balancer=False,
        docker_image=docker_image,
        name_prefix=name,
        additional_args=args,
        env_vars=env_vars,
        use_shared_volume=False,
        use_cluster_spec=False)
    kubernetes_config_path = os.path.join(
        FLAGS.config_output_file_dir, name + '.yaml')
    with open(kubernetes_config_path, 'w') as output_config_file:
      output_config_file.write(kubernetes_config)
    _RunBenchmark(name, kubernetes_config_path)
def main():
  """Runs every benchmark in FLAGS.benchmark_configs_file.

  For each benchmark entry either builds a fresh docker image or, when
  FLAGS.build_docker_image is False, looks up the most recent image in
  gcloud. Writes a kubernetes yaml config per benchmark and runs them
  sequentially, exiting with status 1 on the first failure.

  Raises:
    NoImageFoundError: if no gcloud image matches the expected pattern.
  """
  # Use a context manager so the config file handle is closed promptly.
  with open(FLAGS.benchmark_configs_file, 'r') as f:
    config_text = f.read()
  # NOTE(review): yaml.load without an explicit Loader can construct
  # arbitrary Python objects; prefer yaml.safe_load if these configs can
  # ever come from an untrusted source.
  configs = yaml.load(config_text)
  docker_client = docker.from_env()
  time_tag = datetime.now().strftime('%d_%m_%Y_%H_%M')
  # Create directories to store kubernetes yaml configs in.
  if not os.path.isdir(FLAGS.config_output_file_dir):
    os.makedirs(FLAGS.config_output_file_dir)
  # Keeps track of already built docker images in case multiple benchmarks
  # use the same docker image.
  benchmark_name_to_docker_image = {}
  # TODO(annarev): run benchmarks in parallel instead of sequentially.
  for config in configs:
    name = _ConvertToValidName(str(config['benchmark_name']))
    if name in benchmark_name_to_docker_image:
      docker_image = benchmark_name_to_docker_image[name]
    elif FLAGS.build_docker_image:
      docker_image = _BuildAndPushDockerImage(
          docker_client, config['docker_file'], name, time_tag,
          FLAGS.store_docker_image_in_gcloud)
      benchmark_name_to_docker_image[name] = docker_image
    else:
      image_pattern = _DOCKER_IMAGE_PATTERN % name
      docker_image = _GetMostRecentDockerImageFromGcloud(image_pattern)
      if not docker_image:
        # Bug fix: the original interpolated docker_image into the message,
        # but docker_image is known to be falsy (None/empty) on this path,
        # producing a useless error. Report the pattern that was searched.
        raise NoImageFoundError(
            'No tags found for image %s.' % image_pattern)
    env_vars = {
        _OUTPUT_FILE_ENV_VAR: os.path.join(
            FLAGS.benchmark_results_dir, name + '.json'),
        _TEST_NAME_ENV_VAR: name,
    }
    # dict.get with a default replaces the conditional-expression lookup.
    gpu_count = config.get('gpus_per_machine', 0)
    volumes = {}
    if gpu_count > 0:
      volumes = get_gpu_volume_mounts()
      env_vars['LD_LIBRARY_PATH'] = (
          '/usr/lib/cuda:/usr/lib/nvidia:/usr/lib/x86_64-linux-gnu')
    env_vars.update(config.get('env_vars', {}))
    args = config.get('args', {})
    kubernetes_config = k8s_tensorflow_lib.GenerateConfig(
        config['worker_count'],
        config['ps_count'],
        _PORT,
        request_load_balancer=False,
        docker_image=docker_image,
        name_prefix=name,
        additional_args=args,
        env_vars=env_vars,
        volumes=volumes,
        use_shared_volume=False,
        use_cluster_spec=False,
        gpu_limit=gpu_count)
    kubernetes_config_path = os.path.join(
        FLAGS.config_output_file_dir, name + '.yaml')
    with open(kubernetes_config_path, 'w') as output_config_file:
      output_config_file.write(kubernetes_config)
    success = _RunBenchmark(name, kubernetes_config_path)
    if not success:
      sys.exit(1)
def main():
  """Parses CLI flags and prints a generated kubernetes yaml config.

  Validates that worker and parameter-server counts are positive, then
  emits the config produced by k8s_tensorflow_lib.GenerateConfig to
  stdout. Exits with status 1 on invalid counts.
  """
  parser = argparse.ArgumentParser()
  parser.register('type', 'bool',
                  lambda v: v.lower() in ('true', 't', 'y', 'yes'))
  parser.add_argument('--num_workers',
                      type=int,
                      default=2,
                      help='How many worker pods to run')
  parser.add_argument('--num_parameter_servers',
                      type=int,
                      default=1,
                      # Fixed help-string typo: "paramater" -> "parameter".
                      help='How many parameter server pods to run')
  parser.add_argument('--grpc_port',
                      type=int,
                      default=DEFAULT_PORT,
                      help='GRPC server port (Default: %d)' % DEFAULT_PORT)
  parser.add_argument(
      '--request_load_balancer',
      type='bool',
      default=False,
      help='To request worker0 to be exposed on a public IP '
      'address via an external load balancer, enabling you to '
      'run client processes from outside the cluster')
  parser.add_argument(
      '--docker_image',
      type=str,
      default=DEFAULT_DOCKER_IMAGE,
      help='Override default docker image for the TensorFlow '
      'GRPC server')
  parser.add_argument('--name_prefix',
                      type=str,
                      default='tf',
                      help='Prefix for job names. Jobs will be named as '
                      '<name_prefix>_worker|ps<task_id>')
  parser.add_argument('--use_shared_volume',
                      type='bool',
                      default=True,
                      help='Whether to mount /shared directory from host to '
                      'the pod')
  args = parser.parse_args()

  if args.num_workers <= 0:
    sys.stderr.write('--num_workers must be greater than 0; received %d\n' %
                     args.num_workers)
    sys.exit(1)
  if args.num_parameter_servers <= 0:
    sys.stderr.write(
        '--num_parameter_servers must be greater than 0; received %d\n' %
        args.num_parameter_servers)
    sys.exit(1)

  # Generate contents of yaml config.
  yaml_config = k8s_tensorflow_lib.GenerateConfig(
      args.num_workers,
      args.num_parameter_servers,
      args.grpc_port,
      args.request_load_balancer,
      args.docker_image,
      args.name_prefix,
      env_vars=None,
      use_shared_volume=args.use_shared_volume)

  print(yaml_config)  # pylint: disable=superfluous-parens
def main():
  """Generates a kubernetes benchmark config from YAML benchmark configs.

  Parses CLI flags, reads the benchmark configs file relative to this
  script, and writes the generated kubernetes config for each benchmark
  entry to flags.benchmark_config_output (each entry overwrites the
  previous one; only the last config survives).
  """
  parser = argparse.ArgumentParser()
  parser.register('type', 'bool',
                  lambda v: v.lower() in ('true', 't', 'y', 'yes'))
  parser.add_argument('--benchmark_configs_file',
                      type=str,
                      default=None,
                      required=True,
                      help='YAML file with benchmark configs.')
  parser.add_argument('--benchmark_config_output',
                      type=str,
                      default=None,
                      required=True,
                      help='YAML file to store final config.')
  parser.add_argument('--docker_image',
                      type=str,
                      default=None,
                      required=True,
                      # Fixed help-string typo: "iage" -> "image".
                      help='Docker image to use on K8S to run test.')
  parser.add_argument(
      '--cuda_lib_dir',
      type=str,
      default=None,
      required=False,
      help='Directory where cuda library files are located on gcloud node.')
  parser.add_argument(
      '--nvidia_lib_dir',
      type=str,
      default=None,
      required=False,
      help='Directory where nvidia library files are located on gcloud node.')
  flags, _ = parser.parse_known_args()
  logging.basicConfig(level=logging.DEBUG)

  config_base_path = os.path.dirname(__file__)
  # Use a context manager so the config file handle is closed promptly
  # instead of being leaked until GC.
  with open(os.path.join(config_base_path,
                         flags.benchmark_configs_file), 'r') as f:
    config_text = f.read()
  # NOTE(review): yaml.load without an explicit Loader can construct
  # arbitrary Python objects; prefer yaml.safe_load if these configs can
  # ever come from an untrusted source.
  configs = yaml.load(config_text)

  # TODO(annarev): run benchmarks in parallel instead of sequentially.
  for config in configs:
    name = _ConvertToValidName(str(config['benchmark_name']))
    env_vars = {_TEST_NAME_ENV_VAR: name}
    gpu_count = (0 if 'gpus_per_machine' not in config
                 else config['gpus_per_machine'])
    volumes = {}
    if gpu_count > 0:
      volumes = _GetGpuVolumeMounts(flags)
      env_vars['LD_LIBRARY_PATH'] = (
          '/usr/lib/cuda:/usr/lib/nvidia:/usr/lib/x86_64-linux-gnu')
    env_vars.update(config.get('env_vars', {}))
    args = config.get('args', {})
    kubernetes_config = k8s_tensorflow_lib.GenerateConfig(
        config['worker_count'],
        config['ps_count'],
        _PORT,
        request_load_balancer=False,
        docker_image=flags.docker_image,
        name_prefix=name,
        additional_args=args,
        env_vars=env_vars,
        volumes=volumes,
        use_shared_volume=False,
        use_cluster_spec=False,
        gpu_limit=gpu_count)
    with open(flags.benchmark_config_output, 'w') as output_config_file:
      output_config_file.write(kubernetes_config)