def testAggregateTensors(self):
  t0 = constant_op.constant([[1., 2.], [0, 0], [3., 4.]])
  t1 = constant_op.constant([[0., 0.], [5, 6], [7., 8.]])
  total = constant_op.constant([[1., 2.], [5, 6], [10., 12.]])
  result = cross_device_utils.aggregate_tensors_or_indexed_slices([t0, t1])
  self._assert_values_equal(total, result)
def _test_reduce_indexed_slices(self,
                                task_type,
                                task_id,
                                num_gpus,
                                communication,
                                batch_reduce,
                                variable_length,
                                local_mode=False):
  collective_all_reduce, devices, master_target = self._get_test_objects(
      task_type,
      task_id,
      num_gpus,
      communication=communication,
      local_mode=local_mode)
  if local_mode:
    num_workers = 1
    worker_device = None
  else:
    num_workers = len(self._cluster_spec.get("chief", [])) + len(
        self._cluster_spec.get("worker", []))
    worker_device = "/job:%s/task:%d" % (task_type, task_id)
  with ops.Graph().as_default(), \
       ops.device(worker_device), \
       self.cached_session(target=master_target) as sess:
    per_replica = self._get_indexed_slices(devices,
                                           (task_id or 0) * max(num_gpus, 1),
                                           variable_length)

    if batch_reduce:
      result = collective_all_reduce.batch_reduce(
          reduce_util.ReduceOp.SUM, [(per_replica, per_replica)])[0]
    else:
      result = collective_all_reduce.reduce(reduce_util.ReduceOp.SUM,
                                            per_replica, per_replica)
    if num_gpus > 1:
      self.assertIsInstance(result, value_lib.Mirrored)

    run_options = config_pb2.RunOptions()
    run_options.experimental.collective_graph_key = 7
    if num_gpus > 1:
      result = sess.run([ops.convert_to_tensor(v) for v in result.values],
                        options=run_options)[0]
    else:
      result = sess.run(ops.convert_to_tensor(result), options=run_options)

    # Reduce the same indexed slices on CPU locally as our expected results.
    devices_cpu = [(worker_device or "") + "/device:CPU:0"] * (
        max(num_gpus, 1) * num_workers)
    per_replica_on_cpu = self._get_indexed_slices(
        devices_cpu, 0, variable_length, as_per_replica=False)
    expected_result = cross_device_utils.aggregate_tensors_or_indexed_slices(
        per_replica_on_cpu)
    expected_result = sess.run(ops.convert_to_tensor(expected_result))

    self.assertAllEqual(expected_result, result)
def testAggregateIndexedSlices(self):
  t0 = math_ops._as_indexed_slices(
      constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
  t1 = math_ops._as_indexed_slices(
      constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
  total = constant_op.constant([[1., 2.], [5, 6], [10., 12.]])
  result = cross_device_utils.aggregate_tensors_or_indexed_slices([t0, t1])
  self.assertIsInstance(result, ops.IndexedSlices)
  self._assert_values_equal(total, result)
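# Illustrative sketch (an addition, not part of the original test file): summing
# two IndexedSlices is equivalent to summing their densified forms, which is
# what the test above checks. The densified operands below match the test's t0
# and t1; the public `tf` API is assumed here instead of the internal modules
# used by the surrounding tests, and the helper name is hypothetical.
def _indexed_slices_aggregation_sketch():
  import tensorflow as tf  # assumed available in this environment
  a = tf.IndexedSlices(
      values=tf.constant([[1., 2.], [3., 4.]]),
      indices=tf.constant([0, 2], dtype=tf.int64),
      dense_shape=tf.constant([3, 2], dtype=tf.int64))
  b = tf.IndexedSlices(
      values=tf.constant([[5., 6.], [7., 8.]]),
      indices=tf.constant([1, 2], dtype=tf.int64),
      dense_shape=tf.constant([3, 2], dtype=tf.int64))
  # Densifying each operand and adding gives the same values the test expects
  # from aggregate_tensors_or_indexed_slices: [[1., 2.], [5., 6.], [10., 12.]].
  return tf.convert_to_tensor(a) + tf.convert_to_tensor(b)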
def _test_reduce_indexed_slices(self,
                                task_type,
                                task_id,
                                num_gpus,
                                batch_reduce,
                                local_mode=False):
  collective_all_reduce, devices, master_target = self._get_test_objects(
      task_type, task_id, num_gpus, local_mode=local_mode)
  if local_mode:
    num_workers = 1
    worker_device = None
  else:
    num_workers = len(self._cluster_spec.get("chief", [])) + len(
        self._cluster_spec.get("worker", []))
    worker_device = "/job:%s/task:%d" % (task_type, task_id)
  with ops.Graph().as_default(), \
       ops.device(worker_device), \
       self.cached_session(target=master_target) as sess:
    per_replica = self._get_indexed_slices(devices,
                                           (task_id or 0) * max(num_gpus, 1))

    if batch_reduce:
      result = collective_all_reduce.batch_reduce(
          reduce_util.ReduceOp.SUM, [(per_replica, per_replica)])[0]
    else:
      result = collective_all_reduce.reduce(reduce_util.ReduceOp.SUM,
                                            per_replica, per_replica)
    self.assertIsInstance(result, value_lib.Mirrored)

    run_options = config_pb2.RunOptions()
    run_options.experimental.collective_graph_key = 7
    result = sess.run([ops.convert_to_tensor(v) for v in result.values],
                      options=run_options)[0]

    # Reduce the same indexed slices on CPU locally as our expected results.
    devices_cpu = [(worker_device or "") + "/device:CPU:0"] * (
        max(num_gpus, 1) * num_workers)
    per_replica_on_cpu = self._get_indexed_slices(
        devices_cpu, 0, as_per_replica=False)
    expected_result = cross_device_utils.aggregate_tensors_or_indexed_slices(
        per_replica_on_cpu)
    expected_result = sess.run(ops.convert_to_tensor(expected_result))

    self.assertAllEqual(expected_result, result)
    return True
def _simple_reduce(per_replica_value, reduce_to_device, accumulation_fn,
                   reduce_op):
  # pylint: disable=g-missing-docstring
  all_values = per_replica_value.values
  if not all_values:
    raise ValueError("`per_replica_value` must be non-empty")
  count = len(all_values)

  with ops.device(reduce_to_device):
    with context.device_policy(context.DEVICE_PLACEMENT_SILENT):
      reduced = cross_device_utils.aggregate_tensors_or_indexed_slices(
          all_values, accumulation_fn)
      if reduce_op == reduce_util.ReduceOp.MEAN:
        reduced = cross_device_utils.divide_by_n_tensors_or_indexed_slices(
            reduced, count)
      elif reduce_op != reduce_util.ReduceOp.SUM:
        raise ValueError("`reduce_op` must be Reduce.SUM or Reduce.MEAN.")
  return reduced
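# Illustrative sketch (an addition, not part of the original source): a minimal
# example of how _simple_reduce might be invoked. `per_replica` stands for any
# object exposing a `.values` tuple of per-replica tensors; in the real code
# path it comes from the distribution-strategy machinery, and the
# accumulation_fn shown here (math_ops.add_n) is only an assumed choice.
def _simple_reduce_usage_sketch(per_replica):
  return _simple_reduce(
      per_replica_value=per_replica,          # object exposing `.values`
      reduce_to_device="/device:CPU:0",       # where the reduced value should live
      accumulation_fn=math_ops.add_n,         # how the per-replica tensors are summed
      reduce_op=reduce_util.ReduceOp.MEAN)    # MEAN divides the sum by len(values)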