def make_collective(self, num_processes, gpu_per_process, communication):
  """Returns collectives and other info to be used in tests.

  Args:
    num_processes: an integer indicating the number of processes that
      participate in the collective.
    gpu_per_process: number of GPUs (0 if no GPUs) used by each process.
    communication: one of `CollectiveCommunication`.

  Returns:
    A tuple of (collective, devices, task_id) where collective is an instance
    of `CollectiveAllReduce`, devices are a list of local devices (str)
    attached to the current process, and task_id is the task id of the
    current process in the cluster.
  """
  cluster_resolver = cluster_resolver_lib.TFConfigClusterResolver()
  devices = [
      "/job:worker/replica:0/task:%d/device:CPU:0" % cluster_resolver.task_id
  ]
  if gpu_per_process > 0:
    devices = [
        "/job:worker/replica:0/task:%d/device:GPU:%d" %
        (cluster_resolver.task_id, i) for i in range(gpu_per_process)
    ]
  group_size = num_processes * len(devices)
  collective = cross_device_ops_lib.CollectiveAllReduce(
      devices=devices, group_size=group_size, communication=communication)
  return collective, devices, cluster_resolver.task_id
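# A hedged usage sketch, not part of the original tests: with a two-process
# CPU-only run, each process contributes one local device, so the
# collective's group size is num_processes * len(devices) == 2.
# `CollectiveCommunication.RING` (referenced in the docstring above) is an
# assumed import here.
def example_make_collective_usage(self):
  collective, devices, task_id = self.make_collective(
      num_processes=2, gpu_per_process=0,
      communication=CollectiveCommunication.RING)
  assert len(devices) == 1 and 0 <= task_id < 2
  del collective  # constructed for use in an actual all-reduce test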
def worker_fn():
  cluster_resolver = cluster_resolver_lib.TFConfigClusterResolver()
  enable_collective_ops(cluster_resolver)
  collective_ops.all_reduce(
      constant_op.constant(1.),
      group_size=2,
      group_key=100,
      instance_key=100,
      merge_op="Add",
      final_op="Id",
      communication_hint="ring")
  if cluster_resolver.task_type == "worker":
    # MultiProcessRunner will auto restart worker-0.
    os._exit(1)  # pylint: disable=protected-access
  else:
    # The chief should eventually get a FailedPreconditionError after
    # worker-0 has restarted.
    while True:
      time.sleep(1)
      try:
        context.context().check_collective_ops_peer_health(
            "/job:worker/replica:0/task:0",)
      except errors.UnavailableError:
        pass
      except errors.FailedPreconditionError:
        break
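# A hedged sketch of how the worker_fn above is typically driven, assuming
# the MultiProcessRunner utilities used elsewhere in these tests;
# `auto_restart=True` is what restarts worker-0 after os._exit(1).
def example_run_with_auto_restart():
  mpr = multi_process_runner.MultiProcessRunner(
      worker_fn,
      multi_worker_test_base.create_cluster_spec(
          has_chief=True, num_workers=1),
      auto_restart=True)
  mpr.start()
  mpr.join()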
def _create_multi_worker_mirrored():
  tf_config = cluster_resolver.TFConfigClusterResolver()
  master = tf_config.master()
  if tf_config.rpc_layer:
    # Strip off the rpc_layer prefix.
    master = master[len("%s://" % tf_config.rpc_layer):]
  resolver = cluster_resolver.SimpleClusterResolver(
      cluster_spec=tf_config.cluster_spec(),
      task_type=tf_config.task_type,
      task_id=tf_config.task_id,
      master=master,
      environment=tf_config.environment,
      num_accelerators={"GPU": required_gpus},
      rpc_layer=tf_config.rpc_layer or "grpc",
  )
  # Always create the strategy in eager mode so that it starts the server and
  # configures the eager context. The eager context can no longer be
  # configured after initialization.
  with context.eager_mode():
    strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
        cluster_resolver=resolver)
  # TODO(b/152320929): Wait for the cluster before proceeding, otherwise
  # collectives may hang if any worker launches collectives before the chief
  # creates the strategy.
  try:
    multi_process_runner.barrier().wait()
  except ValueError:
    # If the creator is called in the main process,
    # multi_process_runner.barrier() raises ValueError, which is safe to
    # ignore.
    pass
  return strategy
def testAbortCommunication(self, device, communication):
  if communication == "NCCL":
    self.skipTest("b/171358086: cannot test multi worker NCCL")
  dev0 = "/device:%s:0" % device
  cluster_resolver = cluster_resolver_lib.TFConfigClusterResolver()
  enable_collective_ops_with_barrier(cluster_resolver)
  group_size = 2
  group_key = 100
  instance_key = 100
  in_tensor = constant_op.constant([1.])

  # First perform a normal all-reduce to complete the group and instance
  # resolution.
  with ops.device(dev0):
    collective_ops.all_reduce(
        in_tensor,
        group_size,
        group_key,
        instance_key,
        communication_hint=communication)

  if cluster_resolver.task_id == 1:

    def abort_fn():
      time.sleep(2)
      context.context().abort_collective_ops(errors.UNAVAILABLE, "peer down")

    t = threading.Thread(target=abort_fn)
    t.start()

    with self.assertRaisesRegex(errors.UnavailableError, "peer down"):
      with ops.device(dev0):
        collective_ops.all_reduce(
            in_tensor,
            group_size,
            group_key,
            instance_key,
            communication_hint=communication)

    # After abortion, subsequent collectives should fail immediately.
    with self.assertRaisesRegex(errors.UnavailableError, "peer down"):
      with ops.device(dev0):
        collective_ops.all_reduce(
            in_tensor,
            group_size,
            group_key,
            instance_key,
            communication_hint=communication)

    t.join()

  # Enable collective ops again in order to reset the collective executor.
  enable_collective_ops_with_barrier(cluster_resolver)
  with ops.device(dev0):
    collective_ops.all_reduce(
        in_tensor,
        group_size,
        group_key,
        instance_key,
        communication_hint=communication)
def testAbortInstanceParamsResolution(self, device, communication):
  if communication == "NCCL":
    self.skipTest("b/171358086: cannot test multi worker NCCL")
  dev0 = "/device:%s:0" % device
  cluster_resolver = cluster_resolver_lib.TFConfigClusterResolver()
  enable_collective_ops_with_barrier(cluster_resolver)
  group_size = 2
  group_key = 100
  instance_key = 100
  in_tensor = constant_op.constant([1.])

  # First perform a normal all-reduce to complete the group resolution.
  with ops.device(dev0):
    collective_ops.all_reduce(in_tensor, group_size, group_key, instance_key)

  # We use broadcast to test aborting instance resolution since only
  # broadcast waits for the group.
  if cluster_resolver.task_id == 1:

    def abort_fn():
      time.sleep(2)
      context.context().abort_collective_ops(errors.UNAVAILABLE, "peer down")

    t = threading.Thread(target=abort_fn)
    t.start()

    # Use a different instance key to trigger another instance resolution.
    instance_key = 101
    with self.assertRaisesRegex(errors.UnavailableError, "peer down"):
      # This hangs on params resolution since we're only launching one
      # collective for a group size of 2.
      with ops.device(dev0):
        collective_ops.broadcast_send(in_tensor, (1,), dtypes.float32,
                                      group_size, group_key, instance_key)

    # After abortion, subsequent collectives should fail immediately.
    with self.assertRaisesRegex(errors.UnavailableError, "peer down"):
      with ops.device(dev0):
        collective_ops.broadcast_send(in_tensor, (1,), dtypes.float32,
                                      group_size, group_key, instance_key)

    t.join()

  # Enable collective ops again in order to reset the collective executor.
  enable_collective_ops_with_barrier(cluster_resolver)
  # Reassign instance_key so that it's the same on each worker.
  instance_key = 100
  with ops.device(dev0):
    if cluster_resolver.task_id == 0:
      collective_ops.broadcast_send(in_tensor, (1,), dtypes.float32,
                                    group_size, group_key, instance_key)
    else:
      collective_ops.broadcast_recv((1,), dtypes.float32, group_size,
                                    group_key, instance_key)
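# `enable_collective_ops_with_barrier` is referenced by the two tests above
# but not defined in this excerpt. A hedged sketch of what it presumably
# does, assuming an `enable_collective_ops(cluster_resolver)` variant of the
# helper and the MultiProcessRunner barrier used elsewhere in these tests:
def enable_collective_ops_with_barrier(cluster_resolver):
  multi_process_runner.get_barrier().wait()
  enable_collective_ops(cluster_resolver)
  multi_process_runner.get_barrier().wait()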
def _create_parameter_server():
  if framework_test_util.is_xla_enabled():
    # To address test failures resulting from XLA with MultiProcessRunner,
    # continue to use the in-process cluster for XLA tests.
    cluster_def = multi_worker_test_base.create_in_process_cluster(
        num_workers=num_workers, num_ps=num_ps, rpc_layer="grpc")
    resolver = cluster_resolver.SimpleClusterResolver(
        server_lib.ClusterSpec(cluster_def),
        num_accelerators={"GPU": required_gpus},
        rpc_layer="grpc")
    return _create_ps_strategy(resolver, variable_partitioner)
  else:
    tf_config = cluster_resolver.TFConfigClusterResolver()
    cluster_def = tf_config.cluster_spec().as_dict()
    if not cluster_def:
      # When a MultiProcessRunner cluster is used, the cluster is not created
      # at the time the decorator is called. When the test runs, this method
      # is first invoked via the decorator, before the MultiProcessRunner
      # with worker and ps is set up in combinations.py. After setup is done,
      # each subprocess invokes this method again to get the strategy object.
      # We return a None strategy when the main thread invokes this method
      # before the cluster is set up.
      # Returning None is fine here, since this thread will proceed to create
      # the MultiProcessRunner and invoke tests with the decorator inside
      # subprocesses.
      return None
    # MultiProcessRunner is already set up and this method is invoked from a
    # subprocess running the actual test.
    resolver = cluster_resolver.SimpleClusterResolver(
        server_lib.ClusterSpec(cluster_def),
        num_accelerators={"GPU": required_gpus},
        task_type=tf_config.task_type,
        task_id=tf_config.task_id,
        environment=tf_config.environment,
        rpc_layer=tf_config.rpc_layer or "grpc")
    if tf_config.task_type in ("worker", "ps"):
      worker_config = config_pb2.ConfigProto()
      worker_config.inter_op_parallelism_threads = 4  # max num_workers + 1
      try:
        server = server_lib.Server(
            cluster_def,
            job_name=tf_config.task_type,
            task_index=tf_config.task_id,
            protocol="grpc",
            config=worker_config)
      except errors.UnknownError as e:
        if "Could not start gRPC server" in e.message:
          raise unittest.SkipTest("Cannot start std servers.")
        else:
          raise
      # Block the process that starts a server from exiting.
      server.join()
    return _create_ps_strategy(resolver, variable_partitioner)
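# `_create_ps_strategy` is referenced above but not included in this
# excerpt. A hedged sketch of what it plausibly does, wrapping the
# ParameterServerStrategyV2 constructor; the module alias
# `parameter_server_strategy_v2` is an assumed import:
def _create_ps_strategy(resolver, variable_partitioner):
  return parameter_server_strategy_v2.ParameterServerStrategyV2(
      resolver, variable_partitioner=variable_partitioner)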
def worker_fn():
  enable_collective_ops(cluster_resolver_lib.TFConfigClusterResolver())
  # There may be some delay before the servers start up. The health check
  # should eventually succeed.
  while True:
    try:
      for task in [
          "/job:worker/replica:0/task:0",
          "/job:worker/replica:0/task:1",
      ]:
        context.context().check_collective_ops_peer_health(task)
    except errors.UnavailableError:
      continue
    break
  multi_process_runner.get_barrier().wait()
def enable_collective_ops():
  """Enable collectives in the current process."""
  cluster_resolver = cluster_resolver_lib.TFConfigClusterResolver()
  context.context().configure_collective_ops(
      collective_leader="'/job:worker/replica:0/task:0'")
  config_proto = config_pb2.ConfigProto()
  config_proto.experimental.collective_group_leader = (
      "/job:worker/replica:0/task:0")
  server_def = tensorflow_server_pb2.ServerDef(
      cluster=cluster_resolver.cluster_spec().as_cluster_def(),
      default_session_config=config_proto,
      job_name=cluster_resolver.task_type,
      task_index=cluster_resolver.task_id,
      protocol=cluster_resolver.rpc_layer)
  context.context().enable_collective_ops(server_def)
def enable_collectives(self, num_processes, gpu_per_process, communication):
  """Enables collective ops and returns collectives to be used in tests.

  Args:
    num_processes: an integer indicating the number of processes that
      participate in the collective.
    gpu_per_process: number of GPUs (0 if no GPUs) used by each process.
    communication: one of `CollectiveCommunication`.

  Returns:
    A tuple of (collective, devices, task_id) where collective is an instance
    of `CollectiveAllReduce`, devices are a list of local devices (str)
    attached to the current process, and task_id is the task id of the
    current process in the cluster.
  """
  cluster_resolver = cluster_resolver_lib.TFConfigClusterResolver()
  context.context().configure_collective_ops(
      collective_leader="'/job:worker/replica:0/task:0'")
  config_proto = config_pb2.ConfigProto()
  config_proto.experimental.collective_group_leader = (
      "/job:worker/replica:0/task:0")
  server_def = tensorflow_server_pb2.ServerDef(
      cluster=cluster_resolver.cluster_spec().as_cluster_def(),
      default_session_config=config_proto,
      job_name=cluster_resolver.task_type,
      task_index=cluster_resolver.task_id,
      protocol=cluster_resolver.rpc_layer)
  context.context().enable_collective_ops(server_def)
  devices = [
      "/job:worker/replica:0/task:%d/device:CPU:0" % cluster_resolver.task_id
  ]
  if gpu_per_process > 0:
    devices = [
        "/job:worker/replica:0/task:%d/device:GPU:%d" %
        (cluster_resolver.task_id, i) for i in range(gpu_per_process)
    ]
  group_size = num_processes * len(devices)
  collective = cross_device_ops_lib.CollectiveAllReduce(
      devices=devices, group_size=group_size, communication=communication)
  return collective, devices, cluster_resolver.task_id
def enable_collective_ops():
  """Enable collectives in the current process."""
  cluster_resolver = cluster_resolver_lib.TFConfigClusterResolver()
  context.context().configure_collective_ops(
      collective_leader="'/job:worker/replica:0/task:0'")
  config_proto = config_pb2.ConfigProto()
  config_proto.experimental.collective_group_leader = (
      "/job:worker/replica:0/task:0")
  server_def = tensorflow_server_pb2.ServerDef(
      cluster=cluster_resolver.cluster_spec().as_cluster_def(),
      default_session_config=config_proto,
      job_name=cluster_resolver.task_type,
      task_index=cluster_resolver.task_id,
      protocol=cluster_resolver.rpc_layer)
  context.context().enable_collective_ops(server_def)
  # Recover default flag values.
  CollectiveReplicaLauncher._prefer_unique_instance_key = True
  CollectiveReplicaLauncher._prefer_ordering_token = False
def _create_multi_worker_mirrored():
  tf_config = cluster_resolver.TFConfigClusterResolver()
  master = tf_config.master()
  if tf_config.rpc_layer:
    # Strip off the rpc_layer prefix.
    master = master[len("%s://" % tf_config.rpc_layer):]
  resolver = cluster_resolver.SimpleClusterResolver(
      cluster_spec=tf_config.cluster_spec(),
      task_type=tf_config.task_type,
      task_id=tf_config.task_id,
      master=master,
      environment=tf_config.environment,
      num_accelerators={"GPU": required_gpus},
      rpc_layer=tf_config.rpc_layer or "grpc",
  )
  # Disable the health check and coordination service. We don't have a
  # reliable way to shut down the strategy (and thus the strategy health
  # check or coordination service heartbeat) at the end of a test. Turning
  # on the strategy health check or coordination service heartbeat causes
  # some flakiness since we re-create part of the server when creating a
  # strategy, and our tests are capable of handling failures.
  CollectiveAllReduceExtended._enable_check_health = False  # pylint: disable=protected-access
  context.context().configure_coordination_service(service_type="")
  # Always create the strategy in eager mode so that it starts the server and
  # configures the eager context. The eager context can no longer be
  # configured after initialization.
  with context.eager_mode():
    strategy = CollectiveAllReduceStrategy(cluster_resolver=resolver)

  if not use_merge_call:
    strategy.extended._use_merge_call = lambda: False  # pylint: disable=protected-access
  # TODO(b/152320929): Wait for the cluster before proceeding, otherwise
  # collectives may hang if any worker launches collectives before the chief
  # creates the strategy.
  try:
    multi_process_runner.get_barrier().wait()
  except ValueError:
    # If the creator is called in the main process,
    # multi_process_runner.get_barrier() raises ValueError, which is safe to
    # ignore.
    pass
  return strategy
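# A hedged sketch of consuming the created strategy inside a subprocess, not
# part of the original file: every worker runs the same replica_fn and the
# strategy reduces the per-replica results. `def_function` and
# `constant_op` are assumed imports.
def example_strategy_usage():
  strategy = _create_multi_worker_mirrored()

  @def_function.function
  def replica_fn():
    return constant_op.constant(1.)

  per_replica = strategy.run(replica_fn)
  # "SUM" is accepted as a shorthand for tf.distribute.ReduceOp.SUM.
  return strategy.reduce("SUM", per_replica, axis=None)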
def enable_collective_ops():
  """Enable collectives in the current process."""
  cluster_resolver = cluster_resolver_lib.TFConfigClusterResolver()
  context.context().configure_collective_ops(
      collective_leader="'/job:worker/replica:0/task:0'")
  config_proto = config_pb2.ConfigProto()
  config_proto.experimental.collective_group_leader = (
      "/job:worker/replica:0/task:0")
  server_def = tensorflow_server_pb2.ServerDef(
      cluster=cluster_resolver.cluster_spec().as_cluster_def(),
      default_session_config=config_proto,
      job_name=cluster_resolver.task_type,
      task_index=cluster_resolver.task_id,
      protocol=cluster_resolver.rpc_layer)
  context.context().enable_collective_ops(server_def)
  # Recover default flag values.
  cross_device_ops_lib.CollectiveAllReduce._limited_nccl = True
  cross_device_utils.CollectiveReplicaLauncher._use_scoped_allocator = False
  cross_device_utils.CollectiveReplicaLauncher._use_collective_v2 = True
  cross_device_utils.CollectiveReplicaLauncher._use_ordering_token = False
def _create_multi_worker_mirrored():
  tf_config = cluster_resolver.TFConfigClusterResolver()
  resolver = cluster_resolver.SimpleClusterResolver(
      cluster_spec=tf_config.cluster_spec(),
      task_type=tf_config.task_type,
      task_id=tf_config.task_id,
      environment=tf_config.environment,
      num_accelerators={"GPU": required_gpus},
      rpc_layer=tf_config.rpc_layer,
  )
  strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
      cluster_resolver=resolver)
  # TODO(b/152320929): Wait for the cluster before proceeding, otherwise
  # collectives may hang if any worker launches collectives before the chief
  # creates the strategy.
  try:
    multi_process_runner.barrier().wait()
  except ValueError:
    # If the creator is called in the main process,
    # multi_process_runner.barrier() raises ValueError, which is safe to
    # ignore.
    pass
  return strategy
def worker_fn():
  enable_collective_ops(cluster_resolver_lib.TFConfigClusterResolver())
  context.context().check_collective_ops_peer_health(
      "/job:worker/replica:0/task:1",)
def worker_fn():
  enable_collective_ops(cluster_resolver_lib.TFConfigClusterResolver())
  context.context().check_collective_ops_peer_health("localhost:12345",)
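# A hedged sketch of how the two worker_fns above are typically asserted
# against: checking a valid peer like "/job:worker/replica:0/task:1"
# succeeds once the cluster is up, while an address that is not a task in
# the cluster is rejected. The exact error type for the invalid address
# (InvalidArgumentError) is an assumption.
def example_check_health_invalid_peer():
  try:
    worker_fn()  # the "localhost:12345" variant
  except errors.InvalidArgumentError:
    pass  # expected: the peer is not a member of the cluster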