def _create_parameter_server():
  if framework_test_util.is_xla_enabled():
    # To address test failures that result from using XLA with
    # MultiProcessRunner, continue to use an in-process cluster for XLA tests.
    cluster_def = multi_worker_test_base.create_in_process_cluster(
        num_workers=num_workers, num_ps=num_ps, rpc_layer="grpc")
    resolver = cluster_resolver.SimpleClusterResolver(
        server_lib.ClusterSpec(cluster_def),
        num_accelerators={"GPU": required_gpus},
        rpc_layer="grpc")
    return _create_ps_strategy(resolver, variable_partitioner)
  else:
    tf_config = cluster_resolver.TFConfigClusterResolver()
    cluster_def = tf_config.cluster_spec().as_dict()
    if not cluster_def:
      # When a MultiProcessRunner cluster is used, the cluster is not created
      # initially when the decorator is called. When the test runs, this
      # method is first invoked via the decorator before the
      # MultiProcessRunner with worker and ps is set up in combinations.py.
      # After setup is done, each subprocess invokes this method again to get
      # the strategy object. We return a None strategy when the main thread
      # invokes this method before the cluster is set up.
      # Returning None is fine here, since this thread will proceed to create
      # the MultiProcessRunner and invoke the decorated tests inside
      # subprocesses.
      return None
    # The MultiProcessRunner is already set up and this method is invoked
    # from a subprocess running the actual test.
    resolver = cluster_resolver.SimpleClusterResolver(
        server_lib.ClusterSpec(cluster_def),
        num_accelerators={"GPU": required_gpus},
        task_type=tf_config.task_type,
        task_id=tf_config.task_id,
        environment=tf_config.environment,
        rpc_layer=tf_config.rpc_layer or "grpc")
    if tf_config.task_type in ("worker", "ps"):
      worker_config = config_pb2.ConfigProto()
      worker_config.inter_op_parallelism_threads = 4  # max num_workers + 1
      try:
        server = server_lib.Server(
            cluster_def,
            job_name=tf_config.task_type,
            task_index=tf_config.task_id,
            protocol="grpc",
            config=worker_config)
      except errors.UnknownError as e:
        if "Could not start gRPC server" in e.message:
          raise unittest.SkipTest("Cannot start std servers.")
        else:
          raise
      # Block the process that starts a server from exiting.
      server.join()
    return _create_ps_strategy(resolver, variable_partitioner)
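The free variables referenced above (num_workers, num_ps, required_gpus, variable_partitioner) and the undefined _create_ps_strategy helper suggest this creator is defined inside a factory closure. A minimal sketch of that wrapper, with assumed (not verbatim) names:

def _get_ps_strategy_creator(num_workers, num_ps, required_gpus=0,
                             variable_partitioner=None):
  """Hedged sketch of the enclosing factory; names here are assumptions."""

  def _create_ps_strategy(resolver, variable_partitioner):
    # Builds the actual strategy once a resolver for the cluster exists.
    return parameter_server_strategy_v2.ParameterServerStrategyV2(
        resolver, variable_partitioner=variable_partitioner)

  def _create_parameter_server():
    ...  # body as shown above

  return _create_parameter_server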
def _create_multi_worker_mirrored():
  tf_config = cluster_resolver.TFConfigClusterResolver()
  master = tf_config.master()
  if tf_config.rpc_layer:
    # Strip off the rpc_layer prefix, e.g. "grpc://".
    master = master[len("%s://" % tf_config.rpc_layer):]
  resolver = cluster_resolver.SimpleClusterResolver(
      cluster_spec=tf_config.cluster_spec(),
      task_type=tf_config.task_type,
      task_id=tf_config.task_id,
      master=master,
      environment=tf_config.environment,
      num_accelerators={"GPU": required_gpus},
      rpc_layer=tf_config.rpc_layer or "grpc",
  )
  # Always create the strategy in eager mode so that it starts the server and
  # configures the eager context. The eager context can no longer be
  # configured after initialization.
  with context.eager_mode():
    strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
        cluster_resolver=resolver)
  # TODO(b/152320929): Wait for the cluster before proceeding, otherwise
  # collectives may hang if any worker launches collectives before the chief
  # creates the strategy.
  try:
    multi_process_runner.barrier().wait()
  except ValueError:
    # If the creator is called in the main process,
    # multi_process_runner.barrier() raises ValueError, which is safe to
    # ignore.
    pass
  return strategy
def testKeepLogicalDevice(self):
  gpus = tf_config.list_physical_devices('GPU')
  if len(gpus) > 1:
    self.skipTest('Skip logical device test on multiple GPUs, since partial '
                  'GPU virtualization is not permitted.')
  # Cannot change logical devices after the context initialization.
  context._reset_context()  # pylint: disable=protected-access
  cluster_spec = multi_worker_test_base.create_cluster_spec(
      has_chief=False, num_workers=1)
  resolver = cluster_resolver_lib.SimpleClusterResolver(
      cluster_spec=multi_worker_util.normalize_cluster_spec(cluster_spec),
      task_type='worker',
      task_id=0)
  logical_gpus = len(gpus) * 2
  for i, device in enumerate(gpus):
    n = (i + 1) * logical_gpus // len(gpus) - i * logical_gpus // len(gpus)
    assert n > 0  # guaranteed if count >= len(devices)
    configs = []
    for ordinal in range(n):
      config = context.LogicalDeviceConfiguration(
          memory_limit=64, experimental_device_ordinal=ordinal)
      configs.append(config)
    tf_config.set_logical_device_configuration(device, configs)

  collective_all_reduce_strategy.CollectiveAllReduceStrategy(
      cluster_resolver=resolver)
  # Since each physical GPU is split into two logical GPUs, there should be
  # twice as many logical GPUs as physical GPUs.
  self.assertLen(tf_config.list_logical_devices('GPU'), logical_gpus)
  context._reset_context()  # pylint: disable=protected-access
def _get_test_objects(self,
                      task_type,
                      task_id,
                      num_gpus=0,
                      communication=CollectiveCommunication.AUTO,
                      use_strategy_object=False,
                      local_mode=False):
  collective_keys = cross_device_utils.CollectiveKeys(
      group_key_start=10 + CollectiveAllReduceTest.collective_key_base)
  if local_mode:
    if num_gpus:
      devices = ["/device:GPU:%d" % i for i in range(num_gpus)]
    else:
      devices = ["/device:CPU:0"]

    if use_strategy_object:
      strategy = (
          mwms_lib.CollectiveAllReduceStrategy._from_local_devices(  # pylint: disable=protected-access
              devices, communication=communication))
      return strategy, devices, ""
    else:
      collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
          devices=devices,
          group_size=len(devices),
          collective_keys=collective_keys)
      return collective_all_reduce_ops, devices, ""
  else:
    # NCCL requires physical GPUs for every replica, which we can't do with
    # the simulated multi-host setup for now.
    assert communication != CollectiveCommunication.NCCL
    if num_gpus:
      devices = [
          "/job:%s/task:%d/replica:0/device:GPU:%d" % (task_type, task_id, i)
          for i in range(num_gpus)
      ]
    else:
      devices = [
          "/job:%s/task:%d/replica:0/device:CPU:0" % (task_type, task_id)
      ]

    if use_strategy_object:
      resolver = cluster_resolver.SimpleClusterResolver(
          cluster_spec=multi_worker_util.normalize_cluster_spec(
              self._cluster_spec),
          task_type=task_type,
          task_id=task_id,
          num_accelerators={"GPU": num_gpus})
      strategy = mwms_lib.CollectiveAllReduceStrategy(
          cluster_resolver=resolver, communication=communication)
      return (strategy, devices,
              "grpc://" + self._cluster_spec[task_type][task_id])
    else:
      collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
          devices=devices,
          group_size=len(devices) * NUM_WORKERS,
          collective_keys=collective_keys)
      return (collective_all_reduce_ops, devices,
              "grpc://" + self._cluster_spec[task_type][task_id])
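For illustration, a hedged sketch of how a test body might exercise the returned cross-device ops object; the helper name is hypothetical, and it assumes the usual test-module imports (ops, constant_op, value_lib, reduce_util) are available:

def _reduce_ones(self, task_type, task_id, num_gpus):
  # Hypothetical helper, not part of the original test: build the collective
  # ops for this task, place a 1.0 on each local device, and all-reduce with
  # SUM. Every replica in the group contributes 1.0, so the reduced value
  # equals the global group size.
  collective_all_reduce_ops, devices, _ = self._get_test_objects(
      task_type, task_id, num_gpus=num_gpus, use_strategy_object=False)
  per_replica_values = []
  for device in devices:
    with ops.device(device):
      per_replica_values.append(constant_op.constant(1.0))
  per_replica = value_lib.PerReplica(per_replica_values)
  return collective_all_reduce_ops.reduce(
      reduce_util.ReduceOp.SUM, per_replica, destinations=devices[0])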
def _create_parameter_server():
  cluster_def = multi_worker_test_base.create_in_process_cluster(
      num_workers=num_workers, num_ps=num_ps, rpc_layer="grpc")
  resolver = cluster_resolver.SimpleClusterResolver(
      ClusterSpec(cluster_def),
      num_accelerators={"GPU": required_gpus},
      rpc_layer="grpc")
  strategy = parameter_server_strategy_v2.ParameterServerStrategyV2(
      resolver,
      variable_partitioner=sharded_variable.FixedShardsPartitioner(2))
  return strategy
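One plausible way to exercise the returned strategy, sketched under the assumption that ClusterCoordinator is importable as coordinator_lib.ClusterCoordinator and that the variables/array_ops/math_ops/def_function modules are imported; the helper itself is hypothetical:

def _run_trivial_ps_step():
  # Hypothetical usage sketch: wrap the strategy in a ClusterCoordinator and
  # dispatch one step to a worker in the in-process cluster.
  strategy = _create_parameter_server()
  coordinator = coordinator_lib.ClusterCoordinator(strategy)

  with strategy.scope():
    # With FixedShardsPartitioner(2), this length-10 variable is split into
    # two shards placed on the parameter servers.
    v = variables.Variable(array_ops.ones([10]))

  @def_function.function
  def worker_fn():
    # Reads the (sharded) variable on a worker and returns its sum, 10.0.
    return math_ops.reduce_sum(v)

  result = coordinator.schedule(worker_fn)
  coordinator.join()
  return result.fetch()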
def _create_multi_worker_mirrored():
  tf_config = cluster_resolver.TFConfigClusterResolver()
  master = tf_config.master()
  if tf_config.rpc_layer:
    # Strip off the rpc_layer prefix, e.g. "grpc://".
    master = master[len("%s://" % tf_config.rpc_layer):]
  resolver = cluster_resolver.SimpleClusterResolver(
      cluster_spec=tf_config.cluster_spec(),
      task_type=tf_config.task_type,
      task_id=tf_config.task_id,
      master=master,
      environment=tf_config.environment,
      num_accelerators={"GPU": required_gpus},
      rpc_layer=tf_config.rpc_layer or "grpc",
  )
  # Disable the health check and the coordination service. We don't have a
  # reliable way to shut down the strategy (and thus the strategy health
  # check or the coordination service heartbeat) at the end of a test.
  # Turning on the strategy health check or the coordination service
  # heartbeat causes some flakiness since we re-create part of the server
  # when creating a strategy, and our tests are capable of handling failures.
  CollectiveAllReduceExtended._enable_check_health = False  # pylint: disable=protected-access
  context.context().configure_coordination_service(service_type="")
  # Always create the strategy in eager mode so that it starts the server and
  # configures the eager context. The eager context can no longer be
  # configured after initialization.
  with context.eager_mode():
    strategy = CollectiveAllReduceStrategy(cluster_resolver=resolver)

  if not use_merge_call:
    strategy.extended._use_merge_call = lambda: False  # pylint: disable=protected-access
  # TODO(b/152320929): Wait for the cluster before proceeding, otherwise
  # collectives may hang if any worker launches collectives before the chief
  # creates the strategy.
  try:
    multi_process_runner.get_barrier().wait()
  except ValueError:
    # If the creator is called in the main process,
    # multi_process_runner.get_barrier() raises ValueError, which is safe to
    # ignore.
    pass
  return strategy
def testKeepLogicalDevice(self):
  # Cannot change logical devices after the context initialization.
  context._reset_context()  # pylint: disable=protected-access
  cluster_spec = multi_worker_test_base.create_cluster_spec(
      has_chief=False, num_workers=1)
  resolver = cluster_resolver_lib.SimpleClusterResolver(
      cluster_spec=multi_worker_util.normalize_cluster_spec(cluster_spec),
      task_type='worker',
      task_id=0)
  gpus = tf_config.list_physical_devices('GPU')
  tf_config.set_logical_device_configuration(gpus[-1], [
      context.LogicalDeviceConfiguration(64),
      context.LogicalDeviceConfiguration(64),
  ])
  collective_all_reduce_strategy.CollectiveAllReduceStrategy(
      cluster_resolver=resolver)
  # Since we create two logical GPUs out of the last GPU, there should be one
  # more logical GPU than physical GPUs.
  self.assertLen(tf_config.list_logical_devices('GPU'), len(gpus) + 1)
  context._reset_context()  # pylint: disable=protected-access
def _create_multi_worker_mirrored():
  tf_config = cluster_resolver.TFConfigClusterResolver()
  resolver = cluster_resolver.SimpleClusterResolver(
      cluster_spec=tf_config.cluster_spec(),
      task_type=tf_config.task_type,
      task_id=tf_config.task_id,
      environment=tf_config.environment,
      num_accelerators={"GPU": required_gpus},
      rpc_layer=tf_config.rpc_layer,
  )
  strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
      cluster_resolver=resolver)
  # TODO(b/152320929): Wait for the cluster before proceeding, otherwise
  # collectives may hang if any worker launches collectives before the chief
  # creates the strategy.
  try:
    multi_process_runner.barrier().wait()
  except ValueError:
    # If the creator is called in the main process,
    # multi_process_runner.barrier() raises ValueError, which is safe to
    # ignore.
    pass
  return strategy
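For context, a hedged sketch of what a test might do with the returned strategy; it relies only on the public strategy surface (scope, run, reduce), assumes variables and reduce_util are imported, and the helper name is hypothetical:

def _mirrored_variable_sum(strategy):
  # Hypothetical usage sketch: variables created under the scope are mirrored
  # on every replica of every worker, and the reduce below runs over the
  # collectives the strategy configured.
  with strategy.scope():
    v = variables.Variable(1.0)

  per_replica = strategy.run(v.read_value)
  # Summing the mirrored copies yields strategy.num_replicas_in_sync.
  return strategy.reduce(reduce_util.ReduceOp.SUM, per_replica, axis=None)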