def setUp(self):
  """Splits the first physical CPU into two logical devices for the tests."""
  super(StatefulRandomOpsTest, self).setUp()
  cpu_devices = config.list_physical_devices("CPU")
  config.set_logical_device_configuration(
      cpu_devices[0],
      [context.LogicalDeviceConfiguration() for _ in range(2)])
def testCollectiveGroupSizeMismatch(self):
  """Verifies mismatched group sizes in one group_key raise InternalError."""
  cpus = config.list_physical_devices('CPU')
  self.assertEqual(len(cpus), 1)
  config.set_logical_device_configuration(cpus[0], [
      context.LogicalDeviceConfiguration(),
      context.LogicalDeviceConfiguration()
  ])
  context.ensure_initialized()

  @def_function.function
  def run_all_reduce():
    group_key = 10
    instance_key = 20
    t0 = [1, 2, 3, 4]
    t1 = [5, 6, 7, 8]
    with ops.device('/CPU:0'):
      in0 = constant_op.constant(t0)
      c0 = collective_ops.all_reduce(
          in0, group_size=2, group_key=group_key, instance_key=instance_key,
          merge_op='Add', final_op='Id')
    with ops.device('/CPU:1'):
      in1 = constant_op.constant(t1)
      # Deliberately inconsistent group_size (3 vs 2) for the same group_key
      # to trigger the group-size-mismatch error.
      c1 = collective_ops.all_reduce(
          in1, group_size=3, group_key=group_key, instance_key=instance_key,
          merge_op='Add', final_op='Id')
    return c0, c1

  # Fix: assertRaisesRegexp is a deprecated alias (removed in Python 3.12);
  # use assertRaisesRegex, consistent with the other tests in this file.
  with self.assertRaisesRegex(errors.InternalError, 'but that group has size'):
    run_all_reduce()
def set_up_gpu_memory_limit(memory_limit_mb: int) -> None:
  """Caps every visible GPU with a single logical device of the given size.

  Args:
    memory_limit_mb: memory limit, in megabytes, for each GPU's logical device.
  """
  capped_device = context.LogicalDeviceConfiguration(
      memory_limit=memory_limit_mb)
  for gpu_device in framework_config.list_physical_devices("GPU"):
    framework_config.set_logical_device_configuration(
        gpu_device, [capped_device])
def _mimic_two_cpus():
  """Splits the first physical CPU into two logical CPU devices."""
  first_cpu = config.list_physical_devices("CPU")[0]
  config.set_logical_device_configuration(
      first_cpu,
      [context.LogicalDeviceConfiguration() for _ in range(2)])
def testMultipleGroups(self):
  """Runs all-reduces in two distinct collective groups on 3 virtual CPUs."""
  context._reset_context()
  cpus = config.list_physical_devices('CPU')
  self.assertEqual(len(cpus), 1)
  # Three logical CPUs so a group of size 2 and a group of size 3 both fit.
  config.set_logical_device_configuration(cpus[0], [
      context.LogicalDeviceConfiguration(),
      context.LogicalDeviceConfiguration(),
      context.LogicalDeviceConfiguration()
  ])
  context.ensure_initialized()
  num_elements = 4

  @def_function.function
  def run_all_reduce(group_size, group_key):
    # Each group uses its group_key as both the instance key and the input
    # value, so results from different groups are distinguishable.
    instance_key = group_key
    input_value = [group_key for i in range(num_elements)]
    collectives = []
    for device_idx in range(group_size):
      with ops.device('/CPU:{}'.format(device_idx)):
        input_tensor = constant_op.constant(input_value)
        collectives.append(collective_ops.all_reduce(
            input_tensor,
            group_size,
            group_key,
            instance_key,
            merge_op='Add',
            final_op='Id'))
    return collectives

  def run_and_assert(group_size, group_key):
    # Summing `group_size` copies of `group_key` gives group_key * group_size.
    for reduced_tensor in run_all_reduce(group_size, group_key):
      self.assertAllEqual(
          [group_key * group_size for i in range(num_elements)],
          reduced_tensor.numpy())

  run_and_assert(group_size=2, group_key=1)
  run_and_assert(group_size=3, group_key=2)
def testKeepLogicalDevice(self):
  """Checks the strategy keeps a pre-set logical GPU configuration."""
  gpus = tf_config.list_physical_devices('GPU')
  if len(gpus) > 1:
    self.skipTest(
        'Skip logical device test on multi GPUs, since partial GPU '
        'virtualization is not permitted.')
  # Cannot change logical device after the context initialization.
  context._reset_context()  # pylint: disable=protected-access
  cluster_spec = multi_worker_test_base.create_cluster_spec(
      has_chief=False, num_workers=1)
  resolver = cluster_resolver_lib.SimpleClusterResolver(
      cluster_spec=multi_worker_util.normalize_cluster_spec(cluster_spec),
      task_type='worker',
      task_id=0)
  # Target twice as many logical GPUs as physical ones, spread evenly.
  logical_gpus = len(gpus) * 2
  for i, device in enumerate(gpus):
    # Integer-division split assigns each physical device its share of the
    # `logical_gpus` total.
    n = (i + 1) * logical_gpus // len(gpus) - i * logical_gpus // len(gpus)
    assert n > 0  # guaranteed if count >= len(devices)
    configs = []
    for ordinal in range(n):
      config = context.LogicalDeviceConfiguration(
          memory_limit=64, experimental_device_ordinal=ordinal)
      configs.append(config)
    tf_config.set_logical_device_configuration(device, configs)
  # Creating the strategy must not clobber the logical-device layout above.
  collective_all_reduce_strategy.CollectiveAllReduceStrategy(
      cluster_resolver=resolver)
  # Since we create two logical GPUs out of the last GPU, there should be one
  # more logical GPUs than physical GPUs.
  self.assertLen(tf_config.list_logical_devices('GPU'), logical_gpus)
  context._reset_context()  # pylint: disable=protected-access
def configure_virtual_cpus():
  """Splits the first physical CPU into two virtual CPU devices."""
  first_cpu = config.list_physical_devices('CPU')[0]
  config.set_logical_device_configuration(
      first_cpu,
      [context.LogicalDeviceConfiguration() for _ in range(2)])
def testCollectiveGroupSizeMismatch(self):
  """Initializes the context with two logical CPUs carved from one CPU."""
  physical_cpus = config.list_physical_devices('CPU')
  self.assertEqual(len(physical_cpus), 1)
  config.set_logical_device_configuration(
      physical_cpus[0],
      [context.LogicalDeviceConfiguration() for _ in range(2)])
  context.ensure_initialized()
def setUp(self):
  """Configures two virtual CPUs for packed-variable tests."""
  super(PackedDistributedVariableTest, self).setUp()
  # Set 2 virtual CPUs
  first_cpu = config.list_physical_devices('CPU')[0]
  config.set_logical_device_configuration(
      first_cpu,
      [context.LogicalDeviceConfiguration() for _ in range(2)])
def setUp(self):
  """Configures two virtual CPUs for RPC-op tests."""
  super(RpcOpsTest, self).setUp()
  # Set 2 virtual CPUs
  cpu_devices = config.list_physical_devices("CPU")
  config.set_logical_device_configuration(
      cpu_devices[0],
      [context.LogicalDeviceConfiguration() for _ in range(2)])
def setUp(self):
  """Creates two virtual CPUs so MirroredStrategy can span multiple devices."""
  super(LayerCorrectnessTest, self).setUp()
  # Set two virtual CPUs to test MirroredStrategy with multiple devices
  cpu_devices = config_module.list_physical_devices('CPU')
  config_module.set_logical_device_configuration(
      cpu_devices[0],
      [context.LogicalDeviceConfiguration() for _ in range(2)])
def testAbortInstanceParamsResolution(self):
  """Checks aborting collectives unblocks a hanging instance resolution."""
  cpus = config.list_physical_devices('CPU')
  config.set_logical_device_configuration(cpus[0], [
      context.LogicalDeviceConfiguration(),
      context.LogicalDeviceConfiguration()
  ])
  group_size = 2
  group_key = 100
  instance_key = 100
  in_tensor = constant_op.constant(1.)

  def collective_fn():
    for device in ['CPU:0', 'CPU:1']:
      with ops.device(device):
        collective_ops.all_reduce(
            in_tensor,
            group_size,
            group_key,
            instance_key,
            'Add',
            'Id',
            communication_hint='ring')

  # First perform a normal all-reduce to complete the group resolution.
  def_function.function(collective_fn)()

  def abort_fn():
    # Sleep so the abort lands while the all-reduce below is blocked.
    time.sleep(2)
    context.context().abort_collective_ops(errors.UNAVAILABLE, 'peer down')

  t = threading.Thread(target=abort_fn)
  t.start()

  # Use a different instance key to trigger another instance resolution.
  instance_key = 101
  with self.assertRaisesRegex(errors.UnavailableError, 'peer down'):
    # This hangs on params resolution since we're only launching one
    # collective for a group size of 2.
    collective_ops.all_reduce(in_tensor, group_size, group_key, instance_key,
                              'Add', 'Id')

  # After abortion, subsequent collectives should fail immediately.
  with self.assertRaisesRegex(errors.UnavailableError, 'peer down'):
    collective_ops.all_reduce(in_tensor, group_size, group_key, instance_key,
                              'Add', 'Id')

  # Reset the context in order to reset the collective executor.
  context._reset_context()  # pylint: disable=protected-access
  t.join()

  # After reset non-NCCL collectives should work.
  cpus = config.list_physical_devices('CPU')
  config.set_logical_device_configuration(cpus[0], [
      context.LogicalDeviceConfiguration(),
      context.LogicalDeviceConfiguration()
  ])
  def_function.function(collective_fn)()
def worker_fn():
  """Runs 100 worker steps, sharing any physical GPU via a 64MB virtual GPU."""
  physical_gpus = config.list_physical_devices('GPU')
  if physical_gpus:
    # Set virtual GPU with memory limit of 64MB so that multiple worker
    # processes can share the physical GPU
    config.set_logical_device_configuration(
        physical_gpus[0], [context.LogicalDeviceConfiguration(64)])
  for _ in range(100):
    worker_step_fn()
def _setup_context(self, num_cpus=2):
  """Resets the eager context and splits the lone CPU into logical devices.

  Args:
    num_cpus: number of logical CPU devices to create. Fix: this parameter
      was previously accepted but ignored — the body always created exactly
      2 devices. It is now honored; the default of 2 preserves the old
      behavior for existing callers.
  """
  context._reset_context()
  cpus = config.list_physical_devices('CPU')
  self.assertEqual(len(cpus), 1)
  config.set_logical_device_configuration(
      cpus[0],
      [context.LogicalDeviceConfiguration() for _ in range(num_cpus)])
  context.ensure_initialized()
def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
  """Simulates two 500MB logical GPUs when exactly one physical GPU exists."""
  super(TestMultiGPUModel, self).__init__(methodName)
  physical_gpus = config.list_physical_devices('GPU')
  if len(physical_gpus) == 1:
    # A GPU is available, simulate 2 instead.
    config.set_logical_device_configuration(
        physical_gpus[0],
        [context.LogicalDeviceConfiguration(500) for _ in range(2)])
def setUp(self):
  """Configures four virtual CPUs for gradient tests."""
  super(FunctionGradientsTest, self).setUp()
  # Set 4 virtual CPUs
  cpu_devices = config.list_physical_devices('CPU')
  config.set_logical_device_configuration(
      cpu_devices[0],
      [context.LogicalDeviceConfiguration() for _ in range(4)])
def setUp(self):
  """Creates three logical CPU devices unless already configured."""
  super(InteropTest, self).setUp()
  first_cpu = config.list_physical_devices('CPU')[0]
  # Only configure once: a device that already has a logical configuration
  # returns a non-None list and is left untouched.
  if config.get_logical_device_configuration(first_cpu) is None:
    config.set_logical_device_configuration(
        first_cpu,
        [context.LogicalDeviceConfiguration() for _ in range(3)])
def _setup_context():
  """Resets the context and splits the first CPU into four logical devices."""
  context._reset_context()
  first_cpu = config.list_physical_devices('CPU')[0]
  config.set_logical_device_configuration(
      first_cpu,
      [context.LogicalDeviceConfiguration() for _ in range(4)])
  context.ensure_initialized()
def _ensure_context_initialized(self):
  """Skips the test without a GPU; otherwise sets up two 1GB virtual GPUs."""
  gpu_devices = config.list_physical_devices('GPU')
  if not gpu_devices:
    self.skipTest('Expected at least 1 GPU but found {} GPUs'.format(
        len(gpu_devices)))
  config.set_logical_device_configuration(
      gpu_devices[0],
      [context.LogicalDeviceConfiguration(1024) for _ in range(2)])
  context.ensure_initialized()
def set_up_virtual_devices():
  """Idempotently splits the first physical CPU into two logical devices."""
  global _virtual_devices_ready
  if not _virtual_devices_ready:
    first_cpu = config.list_physical_devices('CPU')[0]
    config.set_logical_device_configuration(
        first_cpu,
        [context.LogicalDeviceConfiguration() for _ in range(2)])
    # Remember the configuration so repeated calls are no-ops.
    _virtual_devices_ready = True
def testExecutionAfterTimeoutV2(self):
  """Checks collectives time out during execution once a peer is missing."""
  timeout = 1.5
  cpus = config.list_physical_devices('CPU')
  self.assertEqual(len(cpus), 1)
  config.set_logical_device_configuration(cpus[0], [
      context.LogicalDeviceConfiguration(),
      context.LogicalDeviceConfiguration()
  ])
  context.ensure_initialized()
  group_key = 20
  instance_key = 30
  input_data = constant_op.constant([1, 2, 3, 4])

  @def_function.function
  def run_all_reduce():
    for device in ['CPU:0', 'CPU:1']:
      with ops.device(device):
        collective_ops.all_reduce(
            input_data,
            group_size=2,
            group_key=group_key,
            instance_key=instance_key,
            merge_op='Add',
            final_op='Id',
            timeout=timeout)

  # Run a normal all-reduce to complete param resolution.
  run_all_reduce()

  # Only CPU:0 launches, so the group of 2 never completes and the
  # execution-phase timeout fires.
  with self.assertRaisesRegex(errors.DeadlineExceededError,
                              'Collective has timed out during execution'):
    with ops.device('CPU:0'):
      collective_ops.all_reduce(
          input_data,
          group_size=2,
          group_key=group_key,
          instance_key=instance_key,
          merge_op='Add',
          final_op='Id',
          timeout=timeout)

  # We launch the second device after the first device times out. This is to
  # simulate the situation when other workers are slow and the timeout is
  # short. It should error immediately.
  with self.assertRaisesRegex(errors.DeadlineExceededError,
                              'Collective has timed out during execution'):
    with ops.device('CPU:1'):
      # No timeout.
      collective_ops.all_reduce(
          input_data,
          group_size=2,
          group_key=group_key,
          merge_op='Add',
          final_op='Id',
          instance_key=instance_key)
def testAbortRing(self):
  """Checks aborting the collective executor unblocks a hanging ring op."""
  cpus = config.list_physical_devices('CPU')
  config.set_logical_device_configuration(cpus[0], [
      context.LogicalDeviceConfiguration(),
      context.LogicalDeviceConfiguration()
  ])
  group_size = 2
  group_key = 100
  instance_key = 100
  in_tensor = constant_op.constant(1.)

  # First perform a normal collective to finish resolution.
  def collective_fn():
    for device in ['CPU:0', 'CPU:1']:
      with ops.device(device):
        collective_ops.all_reduce(
            in_tensor,
            group_size,
            group_key,
            instance_key,
            'Add',
            'Id',
            communication_hint='ring')

  def_function.function(collective_fn)()

  # Launch a collective that hangs, and abort the collective executor after
  # the launch.
  def abort_fn():
    # Sleep so the abort lands while the all-reduce below is blocked.
    time.sleep(2)
    context.context().abort_collective_ops(errors.UNAVAILABLE, 'peer down')

  t = threading.Thread(target=abort_fn)
  t.start()

  # This all-reduce hangs (only one of two group members launched) until the
  # abort thread unblocks it.
  with self.assertRaisesRegex(errors.UnavailableError, 'peer down'):
    collective_ops.all_reduce(in_tensor, group_size, group_key, instance_key,
                              'Add', 'Id')

  # After abortion, subsequent collectives should fail immediately.
  with self.assertRaisesRegex(errors.UnavailableError, 'peer down'):
    collective_ops.all_reduce(in_tensor, group_size, group_key, instance_key,
                              'Add', 'Id')

  # Reset the context in order to reset the collective executor.
  t.join()
  context._reset_context()  # pylint: disable=protected-access

  # After reset non-NCCL collectives should work.
  cpus = config.list_physical_devices('CPU')
  config.set_logical_device_configuration(cpus[0], [
      context.LogicalDeviceConfiguration(),
      context.LogicalDeviceConfiguration()
  ])
  def_function.function(collective_fn)()
def setUp(self):
  """Configures three virtual CPUs and localhost checkpoint options."""
  super(SaverTest, self).setUp()
  # Set 3 virtual CPUs
  cpu_devices = config.list_physical_devices("CPU")
  config.set_logical_device_configuration(
      cpu_devices[0],
      [context.LogicalDeviceConfiguration() for _ in range(3)])
  self.local_options = checkpoint_options.CheckpointOptions(
      experimental_io_device=LOCALHOST)
def __init__(self, methodName="runTest"):  # pylint: disable=invalid-name
  """Creates three virtual CPUs and records their device strings."""
  super(LocalReplicateTest, self).__init__(methodName)
  # Set 3 virtual CPUs
  cpu_devices = config.list_physical_devices("CPU")
  config.set_logical_device_configuration(
      cpu_devices[0],
      [context.LogicalDeviceConfiguration() for _ in range(3)])
  self._device0, self._device1, self._device2 = (
      "/device:CPU:0", "/device:CPU:1", "/device:CPU:2")
def _mimic_two_cpus():
  """Splits the first CPU into two logical devices.

  Returns:
    True on success; False when the CPU device list cannot be obtained.
  """
  try:
    cpu_list = config.list_physical_devices("CPU")
  except errors_impl.NotFoundError:
    # Testing device not available. Skip the test.
    return False
  config.set_logical_device_configuration(
      cpu_list[0],
      [context.LogicalDeviceConfiguration() for _ in range(2)])
  return True
def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
  """Simulates two 500MB logical GPUs when one non-XLA GPU is present."""
  super(TestMultiGPUModel, self).__init__(methodName)
  gpu_list = config.list_physical_devices('GPU')
  xla_gpu_list = config.list_physical_devices('XLA_GPU')
  # NOTE: XLA devices don't support the set_logical_device_configuration
  # codepaths.
  if len(gpu_list) == 1 and not xla_gpu_list:
    # A GPU is available, simulate 2 instead.
    config.set_logical_device_configuration(
        gpu_list[0],
        [context.LogicalDeviceConfiguration(500) for _ in range(2)])
def testGpuInvalidConfig(self):
  """Exercises invalid combinations of memory growth and virtual devices."""
  gpus = config.list_physical_devices('GPU')
  self.assertNotEqual(len(gpus), 0)
  if len(gpus) > 1:
    # Assert if other GPUs were not configured
    config.set_memory_growth(gpus[0], True)
    with self.assertRaisesRegex(ValueError, 'cannot differ'):
      c = context.context().config
    # If we limit visibility to GPU 0, growth is fine
    config.set_visible_devices(gpus[0], 'GPU')
    c = context.context().config
    self.assertTrue(c.gpu_options.allow_growth)
    # Default setting for second GPU is False and works if we set visibility
    config.set_visible_devices(gpus[1], 'GPU')
    c = context.context().config
    self.assertFalse(c.gpu_options.allow_growth)
    # Growth now fails because all the GPUs are visible and not the same
    config.set_visible_devices(gpus, 'GPU')
    with self.assertRaisesRegex(ValueError, 'cannot differ'):
      c = context.context().config
  for gpu in gpus:
    config.set_memory_growth(gpu, True)
  c = context.context().config
  self.assertTrue(c.gpu_options.allow_growth)
  # Virtual devices without a memory limit conflict with memory growth.
  with self.assertRaisesRegex(ValueError, 'memory limit'):
    config.set_logical_device_configuration(gpus[-1], [
        context.LogicalDeviceConfiguration(),
        context.LogicalDeviceConfiguration()
    ])
  # The failed call above must not leave a partial configuration behind.
  self.assertIsNone(config.get_logical_device_configuration(gpus[-1]))
  config.set_logical_device_configuration(gpus[-1], [
      context.LogicalDeviceConfiguration(memory_limit=10),
      context.LogicalDeviceConfiguration(memory_limit=10)
  ])
  c = context.context().config
  self.assertFalse(c.gpu_options.allow_growth)
  # Once virtual devices exist, toggling memory growth is rejected.
  with self.assertRaisesRegex(ValueError, 'virtual devices'):
    config.set_memory_growth(gpus[-1], False)
def set_cpu_logical_devices_to_at_least(num):
  """Create cpu logical devices of at least a given number."""
  physical_devices = config.list_physical_devices('CPU')
  if not physical_devices:
    raise RuntimeError('No CPU found')
  if len(physical_devices) >= num:
    return
  # Each physical device maps to one logical device by default, so the last
  # physical device must host enough extra logical devices to reach `num`
  # in total.
  needed = num - len(physical_devices) + 1
  # Configure the last device rather than the first: the first GPU is often
  # the primary graphics card and may have less memory available.
  config.set_logical_device_configuration(
      physical_devices[-1],
      [context.LogicalDeviceConfiguration() for _ in range(needed)])
def testParamResolutionAfterTimeoutV2(self):
  """Checks the timeout phase differs for the first and second launcher."""
  context._reset_context()
  timeout = 1.5
  cpus = config.list_physical_devices('CPU')
  self.assertEqual(len(cpus), 1)
  config.set_logical_device_configuration(cpus[0], [
      context.LogicalDeviceConfiguration(),
      context.LogicalDeviceConfiguration()
  ])
  context.ensure_initialized()
  group_key = 20
  instance_key = 30
  input_data = constant_op.constant([1, 2, 3, 4])

  # This timeout comes from param solution.
  with self.assertRaisesRegex(
      errors.DeadlineExceededError,
      'Collective has timed out waiting for other workers'):
    with ops.device('CPU:0'):
      collective_ops.all_reduce(
          input_data,
          group_size=2,
          group_key=group_key,
          instance_key=instance_key,
          merge_op='Add',
          final_op='Id',
          timeout=timeout)

  # We launch the second device after the first device times out. This is to
  # simulate the situation when other workers are slow and the timeout is
  # short. Since the CPU:0 times out in the param resolution phase, CPU:1
  # should times out as well, but in the execute phase.
  with self.assertRaisesRegex(errors.DeadlineExceededError,
                              'Collective has timed out during execution'):
    with ops.device('CPU:1'):
      collective_ops.all_reduce(
          input_data,
          group_size=2,
          group_key=group_key,
          instance_key=instance_key,
          merge_op='Add',
          final_op='Id',
          timeout=timeout)
def set_virtual_cpus_to_at_least(num_virtual_cpus):
  """Create virtual CPU devices if they haven't yet been created.

  Raises:
    ValueError: if `num_virtual_cpus` is less than 1.
    RuntimeError: if no physical CPU exists, or the first CPU is already
      configured with fewer virtual devices than requested.
  """
  if num_virtual_cpus < 1:
    raise ValueError("`num_virtual_cpus` must be at least 1 not %r" %
                     (num_virtual_cpus,))
  physical_devices = config.list_physical_devices("CPU")
  if not physical_devices:
    raise RuntimeError("No CPUs found")
  existing = config.get_logical_device_configuration(physical_devices[0])
  if existing is None:
    config.set_logical_device_configuration(
        physical_devices[0],
        [context.LogicalDeviceConfiguration()
         for _ in range(num_virtual_cpus)])
  elif len(existing) < num_virtual_cpus:
    raise RuntimeError("Already configured with %d < %d virtual CPUs" %
                       (len(existing), num_virtual_cpus))