def testAccumulatorMultipleAccumulators(self): with self.cached_session() as sess: q_f32_0 = data_flow_ops.SparseConditionalAccumulator( dtypes_lib.float32, name="Q", shape=tensor_shape.TensorShape([2, 2])) q_f32_1 = data_flow_ops.SparseConditionalAccumulator( dtypes_lib.float32, name="Q", shape=tensor_shape.TensorShape([2, 2])) q_f16_0 = data_flow_ops.SparseConditionalAccumulator( dtypes_lib.float16, name="Q", shape=tensor_shape.TensorShape([2, 2])) q_f16_1 = data_flow_ops.SparseConditionalAccumulator( dtypes_lib.float16, name="Q", shape=tensor_shape.TensorShape([2, 2])) accums = [q_f16_0, q_f16_1, q_f32_0, q_f32_1] elems = [[[1, 0], [0, 0]], [[0, 1], [0, 0]], [[0, 0], [1, 0]], [[0, 0], [0, 1]]] expected_tensors = [] for i in range(len(accums)): tensor_to_add = np.array(elems[i]).astype(accums[i] .dtype.as_numpy_dtype) expected_tensor = _indexedslice(tensor_to_add) expected_tensors.append(expected_tensor) st = _indexedslice(tensor_to_add) accums[i].apply_indexed_slices_grad(st).run() for i in range(len(accums)): result = sess.run(accums[i].take_indexed_slices_grad(1)) self._assertEqual_indexedslices(expected_tensors[i], result)
def testReturnShape(self): with self.cached_session() as sess: q = data_flow_ops.SparseConditionalAccumulator(dtypes_lib.float32, name="Q", shape=[2, None]) q.apply_grad(grad_indices=[0], grad_values=np.array([[[[1, 2], [3, 4]], [[5, 6], [7, 8]]] ]).astype(np.float32)).run() val = self.evaluate(q.take_indexed_slices_grad(1)) self.assertAllEqual(val.dense_shape, [2, 2, 2, 2]) q = data_flow_ops.SparseConditionalAccumulator(dtypes_lib.float32, name="Q", shape=[None, 2]) q.apply_grad(grad_indices=[0], grad_values=np.array([[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]] ]).astype(np.float32)).run() val = self.evaluate(q.take_indexed_slices_grad(1)) self.assertAllEqual(val.dense_shape, [-1, 2, 2, 3])
def testEmptyShapeApply(self): with self.cached_session(): q = data_flow_ops.SparseConditionalAccumulator( dtypes_lib.float32, name="Q", shape=tensor_shape.TensorShape([])) with self.assertRaisesRegexp(errors_impl.InvalidArgumentError, "Input indices should be vector"): q.apply_grad(grad_indices=0, grad_values=[1.0], grad_shape=[]).run() with self.assertRaisesRegexp(errors_impl.InvalidArgumentError, "Input indices should be vector"): q.apply_grad(grad_indices=0, grad_values=[1.0]).run() with self.assertRaisesRegexp(errors_impl.InvalidArgumentError, "Values cannot be 0-dimensional."): q.apply_grad(grad_indices=[0], grad_values=1.0, grad_shape=[]).run() with self.assertRaisesRegexp(errors_impl.InvalidArgumentError, "Values cannot be 0-dimensional."): q.apply_grad(grad_indices=[0], grad_values=1.0).run() # The right way to apply a scalar q.apply_grad(grad_indices=[0], grad_values=[1.0], grad_shape=[]).run() q.apply_grad(grad_indices=[0], grad_values=[1.0]).run()
def testAccumulatorApplyAndBlockingTake(self): # We need each thread to keep its own device stack or the device scopes # won't be properly nested. ops.get_default_graph().switch_to_thread_local() with self.cached_session() as sess: q = data_flow_ops.SparseConditionalAccumulator( dtypes_lib.float32, name="Q", shape=tensor_shape.TensorShape([2, 2])) elems = [10.0, 20.0, 30.0] elems_ave = sum(elems) / len(elems) accum_ops = [] for x in elems: x = _indexedslice(np.array([[0, x], [0, 0]]).astype(np.float32)) accum_ops.append(q.apply_indexed_slices_grad(x, local_step=0)) takeg_t = q.take_indexed_slices_grad(3) results = [] def apply_indexed_slices_grad(): for accum_op in accum_ops: self.evaluate(accum_op) def take_grad(): results.append(self.evaluate(takeg_t)) accum_thread = self.checkedThread(target=apply_indexed_slices_grad) takeg_thread = self.checkedThread(target=take_grad) accum_thread.start() takeg_thread.start() accum_thread.join() takeg_thread.join() self._assertEqual_nparray([[0, elems_ave], [0, 0]], results[0], sess)
def testAccumulatorTakeGradInvalidReductionType(self): with self.assertRaises(ValueError): data_flow_ops.SparseConditionalAccumulator( dtypes_lib.float32, name="Q", shape=(), reduction_type="Invalid")
def testAccumulatorApplyAndBlockingTake(self): with self.test_session() as sess: q = data_flow_ops.SparseConditionalAccumulator( dtypes_lib.float32, name="Q", shape=tensor_shape.TensorShape([2, 2])) elems = [10.0, 20.0, 30.0] elems_ave = sum(elems) / len(elems) accum_ops = [] for x in elems: x = _indexedslice(np.array([[0, x], [0, 0]]).astype(np.float32)) accum_ops.append(q.apply_indexed_slices_grad(x, local_step=0)) takeg_t = q.take_indexed_slices_grad(3) results = [] def apply_indexed_slices_grad(): for accum_op in accum_ops: sess.run(accum_op) def take_grad(): results.append(sess.run(takeg_t)) accum_thread = self.checkedThread(target=apply_indexed_slices_grad) takeg_thread = self.checkedThread(target=take_grad) accum_thread.start() takeg_thread.start() accum_thread.join() takeg_thread.join() self._assertEqual_nparray([[0, elems_ave], [0, 0]], results[0], sess)
def testParallelApplyGradSum(self): with self.test_session() as sess: q = data_flow_ops.SparseConditionalAccumulator( dtypes_lib.float32, name="Q", shape=tensor_shape.TensorShape([2, 2]), reduction_type="SUM") elems = [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0] accum_ops = [] for x in elems: x = _indexedslice(np.array([[x, 0], [0, x]]).astype(np.float32)) accum_ops.append(q.apply_indexed_slices_grad(x, local_step=0)) takeg_t = q.take_indexed_slices_grad(1) def apply_indexed_slices_grad(accum_op): sess.run(accum_op) threads = [ self.checkedThread(target=apply_indexed_slices_grad, args=(o,)) for o in accum_ops ] for thread in threads: thread.start() for thread in threads: thread.join() val = sess.run(takeg_t) expected_val = 550.0 self._assertEqual_nparray( np.array([[expected_val, 0], [0, expected_val]]).astype(np.float32), val, sess)
def testApplyGradtInt32IndicesAndShape(self): with self.cached_session() as sess: q = data_flow_ops.SparseConditionalAccumulator( dtypes_lib.float32, name="Q", shape=tensor_shape.TensorShape([3, 3])) accum_op = q.apply_grad( grad_indices=constant_op.constant( [0, 2], dtype=dtypes_lib.int32), grad_values=constant_op.constant( [[0, 0, 1], [3, 0, 4]], dtype=dtypes_lib.float32), grad_shape=constant_op.constant( [3, 3], dtype=dtypes_lib.int32)) accum_op.run() accum_op = q.apply_indexed_slices_grad( indexed_slices.IndexedSlices( indices=constant_op.constant( [0, 2], dtype=dtypes_lib.int32), values=constant_op.constant( [[0, 0, 1], [3, 0, 4]], dtype=dtypes_lib.float32), dense_shape=constant_op.constant( [3, 3], dtype=dtypes_lib.int32))) accum_op.run() self.assertEqual(q.num_accumulated().eval(), 2) val = self.evaluate(q.take_indexed_slices_grad(1)) self.assertAllEqual(val.indices, [0, 2]) self.assertAllEqual(val.values, [[0, 0, 1], [3, 0, 4]]) self.assertAllEqual(val.dense_shape, [3, 3])
def testParallelApplyGradMean(self): # We need each thread to keep its own device stack or the device scopes # won't be properly nested. ops.get_default_graph().switch_to_thread_local() with self.cached_session() as sess: q = data_flow_ops.SparseConditionalAccumulator( dtypes_lib.float32, name="Q", shape=tensor_shape.TensorShape([2, 2])) elems = [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0] accum_ops = [] for x in elems: x = _indexedslice(np.array([[x, 0], [0, x]]).astype(np.float32)) accum_ops.append(q.apply_indexed_slices_grad(x, local_step=0)) takeg_t = q.take_indexed_slices_grad(1) def apply_indexed_slices_grad(accum_op): self.evaluate(accum_op) threads = [ self.checkedThread( target=apply_indexed_slices_grad, args=(o,)) for o in accum_ops ] for thread in threads: thread.start() for thread in threads: thread.join() val = self.evaluate(takeg_t) expected_val = sum(elems) / len(elems) self._assertEqual_nparray( np.array([[expected_val, 0], [0, expected_val]]).astype(np.float32), val, sess)
def testAccumulatorSetGlobalStep(self): with self.cached_session(): q = data_flow_ops.SparseConditionalAccumulator( dtypes_lib.float32, name="Q", shape=tensor_shape.TensorShape([1])) set_global_step_op = q.set_global_step(1) set_global_step_op.run()
def testZeroDimensionValues(self): with self.cached_session(): q = data_flow_ops.SparseConditionalAccumulator( dtypes_lib.float32, name="Q", shape=tensor_shape.TensorShape([3, 3])) with self.assertRaisesRegex(errors_impl.InvalidArgumentError, "Values cannot be 0-dimensional."): q.apply_grad( grad_indices=[0], grad_values=np.array(1).astype(np.float32)).run()
def testAccumulatorApplyGradFloat32(self): with self.cached_session(): q = data_flow_ops.SparseConditionalAccumulator( dtypes_lib.float32, name="Q", shape=tensor_shape.TensorShape([3, 3])) accum_op = q.apply_indexed_slices_grad( indexed_slices.IndexedSlices( indices=[0, 2], values=np.array([[0, 0, 1], [3, 0, 4]]).astype(np.float32))) accum_op.run() self.assertEqual(q.num_accumulated().eval(), 1)
def testWrongNonEmptyInputValues(self): with self.cached_session(): q = data_flow_ops.SparseConditionalAccumulator( dtypes_lib.float32, name="Q", shape=tensor_shape.TensorShape([3, 3])) with self.assertRaisesRegex(errors_impl.InvalidArgumentError, " non-empty input values, got "): q.apply_grad( grad_indices=[0, 1], grad_values=np.array([[0, 1, 1]]).astype(np.float32)).run()
def testNonVectorIndices(self): with self.cached_session(): q = data_flow_ops.SparseConditionalAccumulator( dtypes_lib.float32, name="Q", shape=tensor_shape.TensorShape([3, 3])) with self.assertRaisesRegex( errors_impl.InvalidArgumentError, "Input indices should be vector but received shape:"): q.apply_grad( grad_indices=[[0, 1], [1, 0]], grad_values=np.array([1, 2]).astype(np.float32)).run()
def testConstructor(self): with ops.Graph().as_default(): q = data_flow_ops.SparseConditionalAccumulator(dtypes_lib.float32, name="Q") self.assertTrue(isinstance(q.accumulator_ref, ops.Tensor)) self.assertProtoEquals( """ name:'Q' op:'SparseConditionalAccumulator' attr { key: 'dtype' value { type: DT_FLOAT } } attr { key: 'shape' value { shape { unknown_rank: true} } } attr { key: 'container' value { s: '' } } attr { key: 'shared_name' value { s: '' } } """, q.accumulator_ref.op.node_def)
def _get_accum_apply_and_agg_grad(var_op, grad, indices, dense_shape): if indices is None: tensor = variable_utils.get_read_var_tensor(var_op) grad_accum = data_flow_ops.ConditionalAccumulator( grad.dtype, shape=tensor.get_shape(), shared_name=var_op.name + "/grad_accum") # Get a copy of consumers list before creating accum_apply_op grad_consumers = list(grad.consumers()) accum_apply_op = grad_accum.apply_grad(grad, local_step=MAX_INT64, name=grad.op.name + '_accum_apply_grad') agg_grad = grad_accum.take_grad(num_accum_required, name=var_op.name + '_take_grad') update_consumers(grad_consumers, grad, agg_grad) update_control_consumers(get_control_consumers(grad.op), grad.op, agg_grad.op) else: grad_indexed_slices = ops.IndexedSlices( values=grad, indices=indices, dense_shape=dense_shape) grad_accum = data_flow_ops.SparseConditionalAccumulator( grad.dtype, shape=grad.shape, shared_name=var_op.name + "/grad_accum") # Get a copy of consumers list before creating accum_apply_op indices_consumers = list(indices.consumers()) grad_consumers = list(grad.consumers()) accum_apply_op = grad_accum.apply_indexed_slices_grad( grad_indexed_slices, local_step=MAX_INT64, name=grad.op.name + '_accum_apply_grad') agg_grad = grad_accum.take_indexed_slices_grad( num_accum_required, name=var_op.name + '_take_grad') agg_indices = agg_grad.indices if indices.dtype != agg_grad.indices.dtype: agg_indices = math_ops.cast(agg_grad.indices, indices.dtype) agg_grad = ops.IndexedSlices(values=agg_grad.values, indices=agg_indices, dense_shape=agg_grad.dense_shape) assert isinstance(agg_grad, ops.IndexedSlices) update_consumers(indices_consumers, indices, agg_grad.indices) update_consumers(grad_consumers, grad, agg_grad.values) update_control_consumers(get_control_consumers(indices.op), indices.op, agg_grad.indices.op) update_control_consumers(get_control_consumers(grad.op), grad.op, agg_grad.values.op) return accum_apply_op, agg_grad
def testDynamicWrongNonEmptyInputValues(self): with self.cached_session() as sess: q = data_flow_ops.SparseConditionalAccumulator( dtypes_lib.float32, name="Q", shape=tensor_shape.TensorShape([3, 3])) x_indices = array_ops.placeholder(dtypes_lib.int64) x_values = array_ops.placeholder(dtypes_lib.float32) accum_op = q.apply_grad(grad_indices=x_indices, grad_values=x_values) with self.assertRaisesRegex(errors_impl.InvalidArgumentError, " non-empty input values, got "): sess.run(accum_op, feed_dict={ x_indices: [0, 1], x_values: np.array([[0, 1, 1]]).astype(np.float32) })
def testParallelTakeGrad(self): # We need each thread to keep its own device stack or the device scopes # won't be properly nested. ops.get_default_graph().switch_to_thread_local() with self.cached_session() as sess: q = data_flow_ops.SparseConditionalAccumulator( dtypes_lib.float32, name="Q", shape=tensor_shape.TensorShape([2, 2])) elems = [e + 1 for e in range(10)] accum_ops = [] for e in elems: v = _indexedslice( np.array([[0, 0], [e, 0]]).astype(np.float32)) accum_ops.append( q.apply_indexed_slices_grad(v, local_step=e - 1)) takeg_t = q.take_indexed_slices_grad(1) results = [] def apply_indexed_slices_grad(): for accum_op in accum_ops: time.sleep(1.0) sess.run(accum_op) apply_indexed_slices_grad_thread = self.checkedThread( target=apply_indexed_slices_grad) def take_grad(): t = sess.run(takeg_t) results.append(t) threads = [self.checkedThread(target=take_grad) for _ in range(10)] for thread in threads: thread.start() apply_indexed_slices_grad_thread.start() for thread in threads: thread.join() apply_indexed_slices_grad_thread.join() for i in range(len(accum_ops)): self._assertEqual_nparray(np.array([[0, 0], [elems[i], 0]]), results[i], sess)
def testDynamicNonVectorIndices(self): with self.cached_session() as sess: q = data_flow_ops.SparseConditionalAccumulator( dtypes_lib.float32, name="Q", shape=tensor_shape.TensorShape([3, 3])) x_indices = array_ops.placeholder(dtypes_lib.int64) x_values = array_ops.placeholder(dtypes_lib.float32) accum_op = q.apply_grad(grad_indices=x_indices, grad_values=x_values) with self.assertRaisesRegex( errors_impl.InvalidArgumentError, "Input indices should be vector but received shape:"): sess.run(accum_op, feed_dict={ x_indices: [[0, 1], [1, 0]], x_values: np.array([1, 2]).astype(np.float32) })
def testAccumulatorCancel(self): with self.cached_session() as sess: q = data_flow_ops.SparseConditionalAccumulator( dtypes_lib.float32, name="Q", shape=tensor_shape.TensorShape([1, 2, 3])) takeg_t = q.take_indexed_slices_grad(1) takeg_thread = self.checkedThread(self._blocking_takeg, args=(sess, takeg_t)) takeg_thread.start() time.sleep(1.0) sess.close() # Will cancel blocked operation takeg_thread.join()
def _aggregate_sparse_gradients(self, var_op, reduce_to_device, indexed_slices_grads, values_op_name): with ops.device(reduce_to_device): grad_accum_op_name = ops.prepend_name_scope( values_op_name, u"%sAccum" % AUTODIST_PREFIX) grad_accum = data_flow_ops.SparseConditionalAccumulator( dtype=indexed_slices_grads[0].values.dtype, shape=var_op.outputs[0].shape, shared_name=grad_accum_op_name, name=grad_accum_op_name) accum_apply_ops = [ grad_accum.apply_indexed_slices_grad( indexed_slices_grads[i], MAX_INT64, name=ops.prepend_name_scope( values_op_name, u"%s-Accum-Apply" % replica_prefix(i))) for i in range(self.num_replicas) ] take_grad_op_name = ops.prepend_name_scope( values_op_name, u"%sTake-Grad" % AUTODIST_PREFIX) with ops.control_dependencies(accum_apply_ops): take_grad = grad_accum.take_indexed_slices_grad( self.num_replicas, name=take_grad_op_name) new_indices = take_grad.indices new_values = take_grad.values new_dense_shape = take_grad.dense_shape if indexed_slices_grads[0].indices.dtype != new_indices.dtype: new_indices = math_ops.cast( new_indices, indexed_slices_grads[0].indices.dtype, name=ops.prepend_name_scope( values_op_name, u"%sTake-Grad-Cast-Indices" % AUTODIST_PREFIX)) if indexed_slices_grads[ 0].dense_shape.dtype != new_dense_shape.dtype: new_dense_shape = math_ops.cast( new_dense_shape, indexed_slices_grads[0].dense_shape.dtype, name=ops.prepend_name_scope( values_op_name, u"%sTake-Grad-Cast-Shape" % AUTODIST_PREFIX)) return ops.IndexedSlices(new_values, new_indices, new_dense_shape)
def testAccumulatorRepeatedTakeGrad(self): with self.cached_session() as sess: q = data_flow_ops.SparseConditionalAccumulator(dtypes_lib.float32, name="Q", shape=()) grad_indexed_slices = ops.IndexedSlices( indices=[0, 1], values=np.array([[1, 0], [0, 2]]).astype(np.float32)) accum_op = q.apply_indexed_slices_grad(grad_indexed_slices, local_step=0) accum_op.run() accum_op = q.apply_grad([0, 2], np.array([[0, 1], [3, 0]]).astype(np.float32), [3, 2], local_step=0) accum_op.run() takeg_t = q.take_indexed_slices_grad(1) val = self.evaluate(takeg_t) self.assertAllEqual(val.indices, [0, 1, 2]) self.assertAllEqual(val.values, [[0.5, 0.5], [0, 2], [3, 0]]) self.assertAllEqual(val.dense_shape, [-1, 2]) grad_indexed_slices = ops.IndexedSlices( indices=[0, 1], values=np.array([[10, 0], [0, 20]]).astype(np.float32)) accum_op = q.apply_indexed_slices_grad(grad_indexed_slices, local_step=1) accum_op.run() accum_op = q.apply_grad([0, 2], np.array([[0, 10], [30, 0]]).astype(np.float32), [3, 2], local_step=1) accum_op.run() takeg_t = q.take_indexed_slices_grad(1) val = self.evaluate(takeg_t) self.assertAllEqual(val.indices, [0, 1, 2]) self.assertAllEqual(val.values, [[5, 5], [0, 20], [30, 0]]) self.assertAllEqual(val.dense_shape, [-1, 2])
def testAccumulatorTakeGradSum(self): with self.cached_session() as sess: q = data_flow_ops.SparseConditionalAccumulator( dtypes_lib.float32, name="Q", shape=(), reduction_type="SUM") grad_indexed_slices = indexed_slices.IndexedSlices( indices=[0, 1], values=np.array([[1, 0], [0, 2]]).astype(np.float32)) accum_op = q.apply_indexed_slices_grad(grad_indexed_slices) accum_op.run() accum_op = q.apply_grad([0, 2], np.array([[0, 1], [3, 0]]).astype(np.float32), [3, 2]) accum_op.run() takeg_t = q.take_indexed_slices_grad(1) val = self.evaluate(takeg_t) self.assertAllEqual([0, 1, 2], val.indices) self.assertAllEqual([[1, 1], [0, 2], [3, 0]], val.values) self.assertAllEqual([-1, 2], val.dense_shape)
def testConstructorWithShape(self): with ops.Graph().as_default(): q = data_flow_ops.SparseConditionalAccumulator( dtypes_lib.float32, name="Q", shape=tensor_shape.TensorShape([1, 5, 2, 8])) self.assertTrue(isinstance(q.accumulator_ref, ops.Tensor)) self.assertProtoEquals( """ name:'Q' op:'SparseConditionalAccumulator' attr { key: 'dtype' value { type: DT_FLOAT } } attr { key: 'shape' value { shape { dim {size: 1 } dim {size: 5 } dim {size: 2 } dim {size: 8 } } } } attr { key: 'container' value { s: '' } } attr { key: 'shared_name' value { s: '' } } """, q.accumulator_ref.op.node_def)
def testAccumulatorCancel(self): # We need each thread to keep its own device stack or the device scopes # won't be properly nested. ops.get_default_graph().switch_to_thread_local() with self.cached_session() as sess: q = data_flow_ops.SparseConditionalAccumulator( dtypes_lib.float32, name="Q", shape=tensor_shape.TensorShape([1, 2, 3])) takeg_t = q.take_indexed_slices_grad(1) takeg_thread = self.checkedThread( self._blocking_takeg, args=(sess, takeg_t)) takeg_thread.start() time.sleep(1.0) sess.close() # Will cancel blocked operation takeg_thread.join()
def testDtypes(self): with self.cached_session() as sess: dtypes = [dtypes_lib.float16, dtypes_lib.float32, dtypes_lib.float64] for i in range(len(dtypes)): dtype = dtypes[i] q = data_flow_ops.SparseConditionalAccumulator( dtype, shape=tensor_shape.TensorShape([3, 3, 3])) elems = np.arange(2) sum_elems = np.zeros([3, 3, 3]).astype(dtype.as_numpy_dtype) for e in elems: mat_to_add = np.zeros([3, 3, 3]).astype(dtype.as_numpy_dtype) mat_to_add[i, i, i] = e + 1 sum_elems += mat_to_add t = _indexedslice(mat_to_add) q.apply_indexed_slices_grad(t).run() result = self.evaluate(q.take_indexed_slices_grad(1)) self._assertEqual_nparray(sum_elems / len(elems), result, sess)
def _apply_model_average(self, lvars_and_gvars, name=None): """Apply local weights to global variables. This contains most of the synchronization implementation. Args: lvars_and_gvars: List of (local_vars, global_vars) pairs. name: Optional name for the returned operation. Default to the name passed to the Optimizer constructor. Returns: train_op: The op to dequeue a token so the replicas can exit this batch and start the next one. This is executed by each replica. Raises: ValueError: If the lvars_and_gvars is empty. """ if not lvars_and_gvars: raise ValueError("Must supply at least one variable") train_ops = [] aggregated_lvars = [] model_reassign_ops = [] global_vars = [g for v, g in lvars_and_gvars if v is not None] # local_anchor op will be placed on this worker task by default. local_anchor = control_flow_ops.no_op() # Colocating local_step variable prevents it being placed on the PS. with ops.colocate_with(local_anchor): self._local_step = variables.Variable( initial_value=0, trainable=False, collections=[ops.GraphKeys.LOCAL_VARIABLES], dtype=self._global_step.dtype.base_dtype, name="%s_local_step" % self._name) self.local_step_init_op = state_ops.assign(self._local_step, self._global_step) chief_init_ops = [self.local_step_init_op] self.ready_for_local_init_op = variables.report_uninitialized_variables( variables.global_variables()) with ops.name_scope(None, self._name): for lvar, gvar in lvars_and_gvars: lvar = ops.convert_to_tensor(lvar) with ops.device(gvar.device): # Dense variables. if lvar is None: aggregated_lvars.append(None) # pass-through. continue elif isinstance(lvar, ops.Tensor): lvar_accum = data_flow_ops.ConditionalAccumulator( lvar.dtype, shape=gvar.get_shape(), shared_name=gvar.name + "/lvar_accum") train_ops.append( lvar_accum.apply_grad(lvar, local_step=self._local_step)) aggregated_lvars.append( lvar_accum.take_grad(self._replicas_to_aggregate)) else: if not isinstance(lvar, ops.IndexedSlices): raise ValueError("Unknown model variable type!") lvar_accum = data_flow_ops.SparseConditionalAccumulator( lvar.dtype, shape=(), shared_name=gvar.name + "/model_variable_accum") train_ops.append( lvar_accum.apply_indexed_slices_grad( lvar, local_step=self._local_step)) aggregated_lvars.append( lvar_accum.take_indexed_slices_grad( self._replicas_to_aggregate)) self._accumulator_list.append((lvar_accum, gvar.device)) # sync_op will be assigned to the same device as the global step. with ops.device(self._global_step.device), ops.name_scope(""): for avg_var, gvar in zip(aggregated_lvars, global_vars): model_reassign_ops.append(state_ops.assign(gvar, avg_var)) model_reassign_ops.append( state_ops.assign_add(self._global_step, 1)) update_op = control_flow_ops.group(*(model_reassign_ops)) # Create token queue. with ops.device(self._global_step.device), ops.name_scope(""): sync_token_queue = (data_flow_ops.FIFOQueue( -1, self._global_step.dtype.base_dtype, shapes=(), name="sync_token_q", shared_name="sync_token_q")) self._sync_token_queue = sync_token_queue # dummy_queue is passed to the queue runner. Don't use the real queues # because the queue runner doesn't automatically reopen it once it # closed queues in PS devices. dummy_queue = (data_flow_ops.FIFOQueue( 1, types_pb2.DT_INT32, shapes=(), name="dummy_queue", shared_name="dummy_queue")) with ops.device(self._global_step.device), ops.name_scope(""): # Replicas have to wait until they can get a token from the token queue. with ops.control_dependencies(train_ops): token = sync_token_queue.dequeue() train_op = state_ops.assign(self._local_step, token) with ops.control_dependencies([update_op]): # Sync_op needs to insert tokens to the token queue at the end of the # step so the replicas can fetch them to start the next step. tokens = array_ops.fill([self._tokens_per_step], self._global_step) sync_op = sync_token_queue.enqueue_many((tokens, )) self._chief_queue_runner = queue_runner.QueueRunner( dummy_queue, [sync_op]) for accum, dev in self._accumulator_list: with ops.device(dev): chief_init_ops.append( accum.set_global_step(self._global_step, name="SetGlobalStep")) self.chief_init_op = control_flow_ops.group(*(chief_init_ops)) self._average_applied = True return train_op
def apply_gradients(self, grads_and_vars, global_step=None, name=None): """Apply gradients to variables. This contains most of the synchronization implementation and also wraps the apply_gradients() from the real optimizer. Args: grads_and_vars: List of (gradient, variable) pairs as returned by compute_gradients(). global_step: Optional Variable to increment by one after the variables have been updated. name: Optional name for the returned operation. Default to the name passed to the Optimizer constructor. Returns: train_op: The op to dequeue a token so the replicas can exit this batch and start the next one. This is executed by each replica. Raises: ValueError: If the grads_and_vars is empty. ValueError: If global step is not provided, the staleness cannot be checked. """ if not grads_and_vars: raise ValueError("Must supply at least one variable") if global_step is None: raise ValueError("Global step is required to check staleness") self._global_step = global_step train_ops = [] aggregated_grad = [] var_list = [] # local_anchor op will be placed on this worker task by default. local_anchor = control_flow_ops.no_op() # Colocating local_step variable prevents it being placed on the PS. distribution_strategy = ( distribution_strategy_context.get_distribution_strategy()) with distribution_strategy.colocate_vars_with(local_anchor): self._local_step = variable_scope.variable( initial_value=0, trainable=False, collections=[ops.GraphKeys.LOCAL_VARIABLES], dtype=global_step.dtype.base_dtype, name="sync_rep_local_step") self.local_step_init_op = state_ops.assign(self._local_step, global_step) chief_init_ops = [self.local_step_init_op] self.ready_for_local_init_op = variables.report_uninitialized_variables( variables.global_variables()) with ops.name_scope(None, self._name): for grad, var in grads_and_vars: var_list.append(var) with ops.device(var.device): # Dense gradients. if grad is None: aggregated_grad.append(None) # pass-through. continue elif isinstance(grad, ops.Tensor): grad_accum = data_flow_ops.ConditionalAccumulator( grad.dtype, shape=var.get_shape(), shared_name=var.name + "/grad_accum") train_ops.append( grad_accum.apply_grad(grad, local_step=self._local_step)) aggregated_grad.append( grad_accum.take_grad(self._replicas_to_aggregate)) else: if not isinstance(grad, ops.IndexedSlices): raise ValueError("Unknown grad type!") grad_accum = data_flow_ops.SparseConditionalAccumulator( grad.dtype, shape=(), shared_name=var.name + "/grad_accum") train_ops.append( grad_accum.apply_indexed_slices_grad( grad, local_step=self._local_step)) aggregated_grad.append( grad_accum.take_indexed_slices_grad( self._replicas_to_aggregate)) self._accumulator_list.append((grad_accum, var.device)) aggregated_grads_and_vars = zip(aggregated_grad, var_list) # sync_op will be assigned to the same device as the global step. with ops.device(global_step.device), ops.name_scope(""): update_op = self._opt.apply_gradients( aggregated_grads_and_vars, global_step) # Create token queue. with ops.device(global_step.device), ops.name_scope(""): sync_token_queue = (data_flow_ops.FIFOQueue( -1, global_step.dtype.base_dtype, shapes=(), name="sync_token_q", shared_name="sync_token_q")) self._sync_token_queue = sync_token_queue # dummy_queue is passed to the queue runner. Don't use the real queues # because the queue runner doesn't automatically reopen it once it # closed queues in PS devices. dummy_queue = (data_flow_ops.FIFOQueue( 1, types_pb2.DT_INT32, shapes=(), name="dummy_queue", shared_name="dummy_queue")) with ops.device(global_step.device), ops.name_scope(""): # Replicas have to wait until they can get a token from the token queue. with ops.control_dependencies(train_ops): token = sync_token_queue.dequeue() train_op = state_ops.assign(self._local_step, token) with ops.control_dependencies([update_op]): # Sync_op needs to insert tokens to the token queue at the end of the # step so the replicas can fetch them to start the next step. tokens = array_ops.fill([self._tokens_per_step], global_step) sync_op = sync_token_queue.enqueue_many((tokens, )) if self._variable_averages is not None: with ops.control_dependencies([sync_op ]), ops.name_scope(""): sync_op = self._variable_averages.apply( self._variables_to_average) self._chief_queue_runner = queue_runner.QueueRunner( dummy_queue, [sync_op]) for accum, dev in self._accumulator_list: with ops.device(dev): chief_init_ops.append( accum.set_global_step(global_step, name="SetGlobalStep")) self.chief_init_op = control_flow_ops.group(*(chief_init_ops)) self._gradients_applied = True return train_op
def testAccumulatorSizeEmpty(self): with self.cached_session(): q = data_flow_ops.SparseConditionalAccumulator( dtypes_lib.float32, name="Q") self.assertEqual(q.num_accumulated().eval(), 0)
def testConstructorWithInvalidArg(self): with ops.Graph().as_default(): with self.assertRaises(ValueError): data_flow_ops.SparseConditionalAccumulator( dtypes_lib.float32, name="Q", reduction_type="Invalid")