Example #1
def _should_cache():
  """Returns True if a default caching device should be set, otherwise False."""
  if context.executing_eagerly():
    return False
  # Don't set a caching device when running in a loop, since it is possible that
  # train steps could be wrapped in a tf.while_loop. In that scenario caching
  # prevents forward computations in loop iterations from re-reading the
  # updated weights.
  ctxt = ops.get_default_graph()._get_control_flow_context()  # pylint: disable=protected-access
  return control_flow_util.GetContainingWhileContext(ctxt) is None
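
For context, a minimal usage sketch (not part of the original example) showing how a helper like `_should_cache` is typically consulted before enabling variable-read caching on a scope. The layer name, shapes, and `_build_dense_layer` helper are hypothetical; TF 1.x graph mode is assumed.

import tensorflow.compat.v1 as tf  # assumption: TF 1.x graph-mode APIs, as in the snippet above

def _build_dense_layer(inputs):  # hypothetical helper
  with tf.variable_scope("dense", reuse=tf.AUTO_REUSE) as scope:
    # Cache variable reads on the device of the consuming op, but only when it
    # is safe to do so (i.e. we are not inside a v1 while_loop).
    if _should_cache() and scope.caching_device is None:
      scope.set_caching_device(lambda op: op.device)
    w = tf.get_variable("kernel", [int(inputs.shape[-1]), 128])
    return tf.matmul(inputs, w)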
Example #2
def _should_cache_variables():
  """Returns True if a default caching device should be set, otherwise False."""
  # Don't set a caching device when running in a loop, since it is possible that
  # train steps could be wrapped in a tf.while_loop. In that scenario caching
  # prevents forward computations in loop iterations from re-reading the
  # updated weights.
  graph = tf.get_default_graph()
  ctxt = graph._get_control_flow_context()  # pylint: disable=protected-access
  in_v1_while_loop = (
      control_flow_util.GetContainingWhileContext(ctxt) is not None)
  return not in_v1_while_loop
Example #3
def create_quantize_graph(config_file, graph=None):
    """
    Quantize the whole graph.
    :param config_file: quantization configuration file
    :param graph: the graph to quantize; defaults to the current default graph
    :return: the graph with quantization ops added
    """
    # Resolve the graph at call time; a `tf.get_default_graph()` default argument
    # is evaluated only once, at function definition time, and may capture the
    # wrong graph.
    if graph is None:
        graph = tf.get_default_graph()
    config = Config(config_file)
    ops = graph.get_operations()
    quant_tensors = {}  # tensors already quantized, to avoid quantizing them twice
    quantize_func = quantize_for_train if config.is_training else quantize_for_eval
    quant_layers = set(tf.get_collection("quant_layers"))
    tf.logging.info(
        "-------------------------Quantize-------------------------")

    # Iterate over all ops and quantize the forward ops
    for op in ops:
        if "gradient" not in op.name and op.type in config.forward_quant_ops and op.name not in quant_layers:
            inp1, inp2 = op.inputs._inputs
            tf.logging.info("Forward Quant:%s" % op.name)
            ctxt = util.GetOutputContext(op)
            while_ctxt = util.GetContainingWhileContext(ctxt)
            graph._set_control_flow_context(ctxt)
            quant_inp1 = quantize_func(inp1, config.get_config(op.type, op.name, "input"))\
                if inp1.name not in quant_tensors else quant_tensors[inp1.name]
            quant_inp2 = quantize_func(inp2, config.get_config(op.type, op.name, "weight")) \
                if inp2.name not in quant_tensors else quant_tensors[inp2.name]
            quant_tensors[inp1.name] = quant_inp1
            quant_tensors[inp2.name] = quant_inp2
            tf.contrib.graph_editor.reroute_ts([quant_inp1, quant_inp2],
                                               op.inputs._inputs,
                                               can_modify=op)
            tf.add_to_collection("quant_layers", op.name)

    # Iterate over all ops and quantize all backward (gradient) ops
    for op in ops:
        if "gradient" in op.name and op.type in config.backward_quant_ops and op.name not in quant_layers:
            tf.logging.info("Backward Quant:%s" % op.name)
            new_inputs = []
            for inp in op.inputs._inputs:
                if "ShapeN" not in inp.name:
                    quant_inp = quantize_func(inp, config.get_config("Gradient", op.name, "gradient")) \
                        if inp.name not in quant_tensors else quant_tensors[inp.name]
                    quant_tensors[inp.name] = quant_inp
                    new_inputs.append(quant_inp)
                else:
                    new_inputs.append(inp)
            tf.contrib.graph_editor.reroute_ts(new_inputs,
                                               op.inputs._inputs,
                                               can_modify=op)
            tf.add_to_collection("quant_layers", op.name)

    graph._set_control_flow_context(None)
    return graph
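
A rough usage sketch, not from the original source: the config path, the toy network, and the behaviour of `Config` and the quantize helpers are assumptions; TF 1.x is required because `tf.contrib.graph_editor` is used above.

import tensorflow as tf  # assumption: TF 1.x, since tf.contrib.graph_editor is used above

with tf.Graph().as_default() as g:
    images = tf.placeholder(tf.float32, [None, 224, 224, 3], name="input")
    net = tf.layers.conv2d(images, 64, 3, name="conv1")    # quantized if "Conv2D" is in config.forward_quant_ops
    logits = tf.layers.dense(tf.layers.flatten(net), 10)   # quantized if "MatMul" is in config.forward_quant_ops
    # "quant_config.yaml" is a placeholder path; the schema is defined by the project's Config class.
    quantized_graph = create_quantize_graph("quant_config.yaml", graph=g)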
Example #4
def dynamic_decode(
    decoder,
    output_time_major=False,
    impute_finished=False,
    maximum_iterations=None,
    parallel_iterations=32,
    swap_memory=False,
    scope=None,
):
    """Perform dynamic decoding with `decoder`.

  Calls initialize() once and step() repeatedly on the Decoder object.

  Args:
    decoder: A `Decoder` instance.
    output_time_major: Python boolean.  Default: `False` (batch major).  If
      `True`, outputs are returned as time major tensors (this mode is faster).
      Otherwise, outputs are returned as batch major tensors (this adds extra
      time to the computation).
    impute_finished: Python boolean.  If `True`, then states for batch
      entries which are marked as finished get copied through and the
      corresponding outputs get zeroed out.  This causes some slowdown at
      each time step, but ensures that the final state and outputs have
      the correct values and that backprop ignores time steps that were
      marked as finished.
    maximum_iterations: `int32` scalar, maximum allowed number of decoding
       steps.  Default is `None` (decode until the decoder is fully done).
    parallel_iterations: Argument passed to `tf.while_loop`.
    swap_memory: Argument passed to `tf.while_loop`.
    scope: Optional variable scope to use.

  Returns:
    `(final_outputs, final_state, final_sequence_lengths)`.

  Raises:
    TypeError: if `decoder` is not an instance of `Decoder`.
    ValueError: if `maximum_iterations` is provided but is not a scalar.
  """
    if not isinstance(decoder, Decoder):
        raise TypeError(
            "Expected decoder to be type Decoder, but saw: %s" % type(decoder)
        )

    with variable_scope.variable_scope(scope, "decoder") as varscope:
        # Determine context types.
        ctxt = (
            ops.get_default_graph()._get_control_flow_context()
        )  # pylint: disable=protected-access
        is_xla = control_flow_util.GetContainingXLAContext(ctxt) is not None
        is_xla = True  # Force the XLA path: the automatic XLA detection above does not work here.
        in_while_loop = control_flow_util.GetContainingWhileContext(ctxt) is not None
        # Properly cache variable values inside the while_loop.
        # Don't set a caching device when running in a loop, since it is possible
        # that train steps could be wrapped in a tf.while_loop. In that scenario
        # caching prevents forward computations in loop iterations from re-reading
        # the updated weights.
        if not context.executing_eagerly() and not in_while_loop:
            if varscope.caching_device is None:
                varscope.set_caching_device(lambda op: op.device)

        if maximum_iterations is not None:
            maximum_iterations = ops.convert_to_tensor(
                maximum_iterations, dtype=dtypes.int32, name="maximum_iterations"
            )
            if maximum_iterations.get_shape().ndims != 0:
                raise ValueError("maximum_iterations must be a scalar")

        initial_finished, initial_inputs, initial_state = decoder.initialize()

        zero_outputs = _create_zero_outputs(
            decoder.output_size, decoder.output_dtype, decoder.batch_size
        )

        if is_xla and maximum_iterations is None:
            raise ValueError("maximum_iterations is required for XLA compilation.")
        if maximum_iterations is not None:
            initial_finished = math_ops.logical_or(
                initial_finished, 0 >= maximum_iterations
            )
        initial_sequence_lengths = array_ops.zeros_like(
            initial_finished, dtype=dtypes.int32
        )
        initial_time = constant_op.constant(0, dtype=dtypes.int32)

        def _shape(batch_size, from_shape):
            if (not isinstance(from_shape, tensor_shape.TensorShape) or from_shape.ndims == 0):
                return tensor_shape.TensorShape(None)
            else:
                batch_size = tensor_util.constant_value(
                    ops.convert_to_tensor(batch_size, name="batch_size")
                )
                return tensor_shape.TensorShape([batch_size]).concatenate(from_shape)

        dynamic_size = maximum_iterations is None or not is_xla

        def _create_ta(s, d):
            return tensor_array_ops.TensorArray(
                dtype=d,
                size=0 if dynamic_size else maximum_iterations,
                dynamic_size=dynamic_size,
                element_shape=_shape(decoder.batch_size, s),
            )

        initial_outputs_ta = nest.map_structure(
            _create_ta, decoder.output_size, decoder.output_dtype
        )

        def condition(
            unused_time,
            unused_outputs_ta,
            unused_state,
            unused_inputs,
            finished,
            unused_sequence_lengths,
        ):
            # Original condition removed: math_ops.logical_not(math_ops.reduce_all(finished))
            # Always continue; the loop is bounded by `maximum_iterations` instead.
            return True

        def body(time, outputs_ta, state, inputs, finished, sequence_lengths):
            """Internal while_loop body.

      Args:
        time: scalar int32 tensor.
        outputs_ta: structure of TensorArray.
        state: (structure of) state tensors and TensorArrays.
        inputs: (structure of) input tensors.
        finished: bool tensor (keeping track of what's finished).
        sequence_lengths: int32 tensor (keeping track of time of finish).

      Returns:
        `(time + 1, outputs_ta, next_state, next_inputs, next_finished,
          next_sequence_lengths)`.
      """
            (next_outputs, decoder_state, next_inputs, decoder_finished) = decoder.step(
                time, inputs, state
            )
            if decoder.tracks_own_finished:
                next_finished = decoder_finished
            else:
                next_finished = math_ops.logical_or(decoder_finished, finished)
            next_sequence_lengths = array_ops.where(
                math_ops.logical_not(finished),
                array_ops.fill(array_ops.shape(sequence_lengths), time + 1),
                sequence_lengths,
            )

            nest.assert_same_structure(state, decoder_state)
            nest.assert_same_structure(outputs_ta, next_outputs)
            nest.assert_same_structure(inputs, next_inputs)

            # Zero out output values past finish
            if impute_finished:
                emit = nest.map_structure(
                    lambda out, zero: array_ops.where(finished, zero, out),
                    next_outputs,
                    zero_outputs,
                )
            else:
                emit = next_outputs

            # Copy through states past finish
            def _maybe_copy_state(new, cur):
                # TensorArrays and scalar states get passed through.
                if isinstance(cur, tensor_array_ops.TensorArray):
                    pass_through = True
                else:
                    new.set_shape(cur.shape)
                    pass_through = new.shape.ndims == 0
                return new if pass_through else array_ops.where(finished, cur, new)

            if impute_finished:
                next_state = nest.map_structure(_maybe_copy_state, decoder_state, state)
            else:
                next_state = decoder_state

            outputs_ta = nest.map_structure(
                lambda ta, out: ta.write(time, out), outputs_ta, emit
            )
            return (
                time + 1,
                outputs_ta,
                next_state,
                next_inputs,
                next_finished,
                next_sequence_lengths,
            )

        res = control_flow_ops.while_loop(
            condition,
            body,
            loop_vars=(
                initial_time,
                initial_outputs_ta,
                initial_state,
                initial_inputs,
                initial_finished,
                initial_sequence_lengths,
            ),
            parallel_iterations=parallel_iterations,
            maximum_iterations=maximum_iterations,
            swap_memory=swap_memory,
        )

        final_outputs_ta = res[1]
        final_state = res[2]
        final_sequence_lengths = res[5]

        final_outputs = nest.map_structure(lambda ta: ta.stack(), final_outputs_ta)

        try:
            final_outputs, final_state = decoder.finalize(
                final_outputs, final_state, final_sequence_lengths
            )
        except NotImplementedError:
            pass

        if not output_time_major:
            final_outputs = nest.map_structure(_transpose_batch_time, final_outputs)

    return final_outputs, final_state, final_sequence_lengths
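
A minimal call sketch for the variant above, assuming TF 1.x with `tf.contrib.seq2seq` supplying the `Decoder`, helper, and cell classes (an assumption; they are not part of this snippet); shapes are illustrative. Because the loop condition was changed to always return True, `maximum_iterations` must be supplied.

import tensorflow as tf  # assumption: TF 1.x graph mode with tf.contrib available

batch_size, max_time, units, vocab_size = 4, 7, 32, 100
inputs = tf.random_uniform([batch_size, max_time, units])
lengths = tf.fill([batch_size], max_time)

cell = tf.nn.rnn_cell.LSTMCell(units)
helper = tf.contrib.seq2seq.TrainingHelper(inputs, lengths)
decoder = tf.contrib.seq2seq.BasicDecoder(
    cell,
    helper,
    initial_state=cell.zero_state(batch_size, tf.float32),
    output_layer=tf.layers.Dense(vocab_size),
)

# The modified condition above never stops on `finished`, so the decode length
# is bounded only by maximum_iterations.
outputs, state, seq_lens = dynamic_decode(decoder, maximum_iterations=max_time)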
Example #5
def dynamic_decode(decoder,
                   output_time_major: bool = False,
                   impute_finished: bool = False,
                   maximum_iterations=None,
                   parallel_iterations: int = 32,
                   swap_memory: bool = False,
                   training=None,
                   scope=None,
                   **kwargs):
    """Perform dynamic decoding with `decoder`.
    Calls initialize() once and step() repeatedly on the Decoder object.
    Args:
      decoder: A `Decoder` instance.
      output_time_major: Python boolean.  Default: `False` (batch major). If
        `True`, outputs are returned as time major tensors (this mode is
        faster). Otherwise, outputs are returned as batch major tensors (this
        adds extra time to the computation).
      impute_finished: Python boolean.  If `True`, then states for batch
        entries which are marked as finished get copied through and the
        corresponding outputs get zeroed out.  This causes some slowdown at
        each time step, but ensures that the final state and outputs have
        the correct values and that backprop ignores time steps that were
        marked as finished.
      maximum_iterations: A strictly positive `int32` scalar, the maximum
         allowed number of decoding steps. Default is `None` (decode until the
         decoder is fully done).
      parallel_iterations: Argument passed to `tf.while_loop`.
      swap_memory: Argument passed to `tf.while_loop`.
      training: Python boolean. Indicates whether the layer should behave
          in training  mode or in inference mode. Only relevant
          when `dropout` or `recurrent_dropout` is used.
      scope: Optional name scope to use.
      **kwargs: dict, other keyword arguments for dynamic_decode. It might
        contain arguments for `BaseDecoder` to initialize, which takes all
        tensor inputs during call().
    Returns:
      `(final_outputs, final_state, final_sequence_lengths)`.
    Raises:
      ValueError: if `maximum_iterations` is provided but is not a scalar.
    """
    with variable_scope.variable_scope(scope, 'decoder') as varscope:
        ctxt = ops.get_default_graph()._get_control_flow_context()
        is_xla = control_flow_util.GetContainingXLAContext(ctxt) is not None
        in_while_loop = (control_flow_util.GetContainingWhileContext(ctxt)
                         is not None)

        if not context.executing_eagerly() and not in_while_loop:
            if varscope.caching_device is None:
                varscope.set_caching_device(lambda op: op.device)

        is_xla = (not tf.executing_eagerly()
                  and control_flow_util.GraphOrParentsInXlaContext(
                      tf.compat.v1.get_default_graph()))

        if maximum_iterations is not None:
            maximum_iterations = tf.convert_to_tensor(
                maximum_iterations,
                dtype=tf.int32,
                name='maximum_iterations',
            )
            if maximum_iterations.shape.ndims != 0:
                raise ValueError('maximum_iterations must be a scalar')
            tf.debugging.assert_greater(
                maximum_iterations,
                0,
                message='maximum_iterations should be greater than 0',
            )
        elif is_xla:
            raise ValueError(
                'maximum_iterations is required for XLA compilation.')

        if isinstance(decoder, Decoder):
            initial_finished, initial_inputs, initial_state = (
                decoder.initialize())
        else:
            # For BaseDecoder that takes tensor inputs during call.
            decoder_init_input = kwargs.pop('decoder_init_input', None)
            decoder_init_kwargs = kwargs.pop('decoder_init_kwargs', {})
            initial_finished, initial_inputs, initial_state = decoder.initialize(
                decoder_init_input, **decoder_init_kwargs)

        zero_outputs = tf.nest.map_structure(
            lambda shape, dtype: tf.zeros(
                _prepend_batch(decoder.batch_size, shape), dtype=dtype),
            decoder.output_size,
            decoder.output_dtype,
        )

        if maximum_iterations is not None:
            initial_finished = tf.logical_or(initial_finished,
                                             0 >= maximum_iterations)
        initial_sequence_lengths = tf.zeros_like(initial_finished,
                                                 dtype=tf.int32)
        initial_time = tf.constant(0, dtype=tf.int32)

        def _shape(batch_size, from_shape):
            if (not isinstance(from_shape, tf.TensorShape)
                    or from_shape.ndims == 0):
                return None
            else:
                batch_size = tf.get_static_value(
                    tf.convert_to_tensor(batch_size, name='batch_size'))
                return tf.TensorShape([batch_size]).concatenate(from_shape)

        dynamic_size = maximum_iterations is None or not is_xla
        # The dynamic shape `TensorArray` is not allowed in TFLite yet.

        def _create_ta(s, d):
            return tf.TensorArray(
                dtype=d,
                size=0 if dynamic_size else maximum_iterations,
                dynamic_size=dynamic_size,
                element_shape=_shape(decoder.batch_size, s),
            )

        initial_outputs_ta = tf.nest.map_structure(_create_ta,
                                                   decoder.output_size,
                                                   decoder.output_dtype)

        def condition(
            unused_time,
            unused_outputs_ta,
            unused_state,
            unused_inputs,
            finished,
            unused_sequence_lengths,
        ):
            return tf.logical_not(tf.reduce_all(finished))

        def body(time, outputs_ta, state, inputs, finished, sequence_lengths):
            """Internal while_loop body.
            Args:
              time: scalar int32 tensor.
              outputs_ta: structure of TensorArray.
              state: (structure of) state tensors and TensorArrays.
              inputs: (structure of) input tensors.
              finished: bool tensor (keeping track of what's finished).
              sequence_lengths: int32 tensor (keeping track of time of finish).
            Returns:
              `(time + 1, outputs_ta, next_state, next_inputs, next_finished,
                next_sequence_lengths)`.
            """
            (
                next_outputs,
                decoder_state,
                next_inputs,
                decoder_finished,
            ) = decoder.step(time, inputs, state, training)
            decoder_state_sequence_lengths = False
            if decoder.tracks_own_finished:
                next_finished = decoder_finished
                lengths = getattr(decoder_state, 'lengths', None)
                if lengths is not None:
                    # sequence lengths are provided by decoder_state.lengths;
                    # overwrite our sequence lengths.
                    decoder_state_sequence_lengths = True
                    sequence_lengths = tf.cast(lengths, tf.int32)
            else:
                next_finished = tf.logical_or(decoder_finished, finished)

            if decoder_state_sequence_lengths:
                # Just pass something through the loop; at the next iteration
                # we'll pull the sequence lengths from the decoder_state again.
                next_sequence_lengths = sequence_lengths
            else:
                next_sequence_lengths = tf.where(
                    tf.logical_not(finished),
                    tf.fill(tf.shape(sequence_lengths), time + 1),
                    sequence_lengths,
                )

            tf.nest.assert_same_structure(state, decoder_state)
            tf.nest.assert_same_structure(outputs_ta, next_outputs)
            tf.nest.assert_same_structure(inputs, next_inputs)

            # Zero out output values past finish
            if impute_finished:

                def zero_out_finished(out, zero):
                    if finished.shape.rank < zero.shape.rank:
                        broadcast_finished = tf.broadcast_to(
                            tf.expand_dims(finished, axis=-1), zero.shape)
                        return tf.where(broadcast_finished, zero, out)
                    else:
                        return tf.where(finished, zero, out)

                emit = tf.nest.map_structure(zero_out_finished, next_outputs,
                                             zero_outputs)
            else:
                emit = next_outputs

            # Copy through states past finish
            def _maybe_copy_state(new, cur):
                # TensorArrays and scalar states get passed through.
                if isinstance(cur, tf.TensorArray):
                    pass_through = True
                else:
                    new.set_shape(cur.shape)
                    pass_through = new.shape.ndims == 0
                if not pass_through:
                    broadcast_finished = tf.broadcast_to(
                        tf.expand_dims(finished, axis=-1), new.shape)
                    return tf.where(broadcast_finished, cur, new)
                else:
                    return new

            if impute_finished:
                next_state = tf.nest.map_structure(_maybe_copy_state,
                                                   decoder_state, state)
            else:
                next_state = decoder_state

            outputs_ta = tf.nest.map_structure(
                lambda ta, out: ta.write(time, out), outputs_ta, emit)
            return (
                time + 1,
                outputs_ta,
                next_state,
                next_inputs,
                next_finished,
                next_sequence_lengths,
            )

        res = tf.while_loop(
            condition,
            body,
            loop_vars=(
                initial_time,
                initial_outputs_ta,
                initial_state,
                initial_inputs,
                initial_finished,
                initial_sequence_lengths,
            ),
            parallel_iterations=parallel_iterations,
            maximum_iterations=maximum_iterations,
            swap_memory=swap_memory,
        )

        final_outputs_ta = res[1]
        final_state = res[2]
        final_sequence_lengths = res[5]

        final_outputs = tf.nest.map_structure(lambda ta: ta.stack(),
                                              final_outputs_ta)

        try:
            final_outputs, final_state = decoder.finalize(
                final_outputs, final_state, final_sequence_lengths)
        except NotImplementedError:
            pass

        if not output_time_major:
            final_outputs = tf.nest.map_structure(_transpose_batch_time,
                                                  final_outputs)

    return final_outputs, final_state, final_sequence_lengths
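
A call sketch for the Keras-style variant above, assuming the decoder and sampler follow the TensorFlow Addons `tfa.seq2seq` API (an assumption; those classes are not defined in this snippet); names and shapes are illustrative only.

import tensorflow as tf
import tensorflow_addons as tfa  # assumption: decoder/sampler classes follow tfa.seq2seq

batch_size, max_time, units, vocab_size = 4, 7, 32, 100
cell = tf.keras.layers.LSTMCell(units)
sampler = tfa.seq2seq.TrainingSampler()
decoder = tfa.seq2seq.BasicDecoder(
    cell, sampler, output_layer=tf.keras.layers.Dense(vocab_size))

inputs = tf.random.uniform([batch_size, max_time, units])
lengths = tf.fill([batch_size], max_time)

# BaseDecoder-style call: tensor inputs are forwarded to decoder.initialize()
# through decoder_init_input / decoder_init_kwargs.
outputs, state, seq_lens = dynamic_decode(
    decoder,
    maximum_iterations=max_time,
    decoder_init_input=inputs,
    decoder_init_kwargs={
        "sequence_length": lengths,
        "initial_state": cell.get_initial_state(
            batch_size=batch_size, dtype=tf.float32),
    },
)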
Example #6
def Decoder_Dynamic_Decode(
    decoder,
    output_time_major=False,
    impute_finished=False,
    maximum_iterations=None,
    parallel_iterations=32,
    swap_memory=False,
    scope=None
    ):
    if not isinstance(decoder, Decoder):
        raise TypeError("Expected decoder to be type Decoder, but saw: %s" % type(decoder))

    with variable_scope.variable_scope(scope, "decoder") as varscope:
        ctxt = tf.get_default_graph()._get_control_flow_context()  # pylint: disable=protected-access
        is_xla = control_flow_util.GetContainingXLAContext(ctxt) is not None
        in_while_loop = control_flow_util.GetContainingWhileContext(ctxt) is not None

        if not context.executing_eagerly() and not in_while_loop:
            if varscope.caching_device is None:
                varscope.set_caching_device(lambda op: op.device)

        if maximum_iterations is not None:
            maximum_iterations = tf.convert_to_tensor(
                maximum_iterations,
                dtype=tf.int32,
                name="maximum_iterations"
                )
            if maximum_iterations.get_shape().ndims != 0:
                raise ValueError("maximum_iterations must be a scalar")
        elif is_xla:
            raise ValueError("maximum_iterations is required for XLA compilation.")

        initial_finished, initial_inputs, initial_state = decoder.initialize()
        if maximum_iterations is not None:
            initial_finished = tf.logical_or(
                initial_finished,
                0 >= maximum_iterations
                )
        initial_sequence_lengths = tf.zeros_like(initial_finished, dtype=tf.int32)
        initial_time = tf.constant(0, dtype=tf.int32)


        def _shape(batch_size, from_shape):
            if (not isinstance(from_shape, TensorShape) or from_shape.ndims == 0):
                return TensorShape(None)
            else:
                batch_size = tensor_util.constant_value(tf.convert_to_tensor(batch_size, name="batch_size"))
            return TensorShape([batch_size]).concatenate(from_shape)

        dynamic_size = maximum_iterations is None or not is_xla
        def _create_ta(s, d):
            return tf.TensorArray(
                dtype=d,
                size=0 if dynamic_size else maximum_iterations,
                dynamic_size=dynamic_size,
                element_shape=_shape(decoder.batch_size, s)
                )

        initial_outputs_ta = nest.map_structure(
            _create_ta,
            decoder.output_size,
            decoder.output_dtype
            )

        def condition(
            unused_time,
            unused_outputs_ta,
            unused_state,
            unused_inputs,
            finished,
            unused_sequence_lengths
            ):
            return tf.logical_not(tf.reduce_all(finished))

        def body(
            time,
            outputs_ta,
            state,
            inputs,
            finished,
            sequence_lengths
            ):
            next_outputs, next_state, next_inputs, decoder_finished = decoder.step(time, inputs, state)
            if decoder.tracks_own_finished:
                next_finished = decoder_finished
            else:
                next_finished = tf.logical_or(decoder_finished, finished)
                # Reasons for the reshape: 1) once it passes through the helper into
                # the loop cond it gets merged, 2) at inference time the value comes
                # out 2-D.
                next_finished = tf.reshape(next_finished, [-1])

            next_sequence_lengths = tf.where(
                tf.logical_not(finished),
                x=tf.fill(tf.shape(sequence_lengths), time + 1),
                y=sequence_lengths
                )

            nest.assert_same_structure(state, next_state)
            nest.assert_same_structure(outputs_ta, next_outputs)
            nest.assert_same_structure(inputs, next_inputs)

            if impute_finished:
                new_linear = nest.map_structure(
                    lambda out, zero: tf.where(finished, zero, out),
                    next_outputs.linear,
                    tf.zeros_like(next_outputs.linear)
                    )
                # `_replace` returns a new namedtuple; reassign it so the zeroed
                # outputs are actually used.
                next_outputs = next_outputs._replace(linear=new_linear)

                def _maybe_copy_state(new, cur):
                    if isinstance(cur, tf.TensorArray):
                        pass_through = True
                    else:
                        new.set_shape(cur.shape)
                        pass_through = (new.shape.ndims == 0)
                    return new if pass_through else tf.where(finished, cur, new)

                next_state = nest.map_structure(_maybe_copy_state, next_state, state)

            outputs_ta = nest.map_structure(lambda ta, out: ta.write(time, out), outputs_ta, next_outputs)

            return time + 1, outputs_ta, next_state, next_inputs, next_finished, next_sequence_lengths

        res = tf.while_loop(
            cond=condition,
            body=body,
            loop_vars=[
                initial_time,
                initial_outputs_ta,
                initial_state,
                initial_inputs,
                initial_finished,
                initial_sequence_lengths
                ],
            parallel_iterations=parallel_iterations,
            maximum_iterations=maximum_iterations,
            swap_memory=swap_memory
            )

        final_outputs_ta, final_state, final_sequence_lengths = res[1], res[2], res[5]
        
        final_outputs = nest.map_structure(lambda ta: ta.stack(), final_outputs_ta)
        try:
            final_outputs, final_state = decoder.finalize(final_outputs, final_state, final_sequence_lengths)
        except NotImplementedError:
            pass

        if not output_time_major:
            final_outputs = nest.map_structure(rnn._transpose_batch_time, final_outputs)

    return final_outputs, final_state, final_sequence_lengths
Example #7
def _GetMaxSizeFromNestedMaximumIterations(value, while_ctxt):
    """Calculate a max_size for use by stack ops inside an XLA while_loop.

  Args:
    value: The value inside the while_loop forward context.  Used for printing
      error messages.
    while_ctxt: The forward context inside which value resides.  This does not
      always match the value's immediate context, as `value` may be inside e.g.
      a cond context inside the while_loop.

  Returns:
    A tensor containing the `max_size` to feed to a Stack initializer.

  Raises:
    ValueError: If `value` is nested inside a `while_loop` that either
      lacks a `maximum_iterations` parameter, or the `maximum_iterations`
      parameter:

        - is inside a `while_loop` that is a parent of the calling context, and
        - cannot be evaluated at graph build time to a constant.
  """
    value_name = value.name
    # curr_ctxt is the context that tf.gradients was called in.
    curr_ctxt = ops.get_default_graph()._get_control_flow_context()  # pylint: disable=protected-access

    curr_ctxt_name = curr_ctxt.name if curr_ctxt is not None else ""
    max_size = constant_op.constant(1)

    # Loop through all containing while contexts between value and the
    # current context, multiplying together each context's
    # max_iterations to get the maximum stack size.
    while while_ctxt not in (None, curr_ctxt):
        max_iter = while_ctxt.maximum_iterations
        if max_iter is None:
            raise ValueError(
                "Cannot create a gradient accumulator for tensor '%s' inside "
                "XLA while_loop because maximum_iterations was not passed to "
                "the tf.while_loop call ('%s')." %
                (value_name, while_ctxt.name))

        # pylint: disable=protected-access
        max_iter_ctxt = max_iter.op._get_control_flow_context()
        # pylint: enable=protected-access

        # If max_iter_ctxt (non-strictly) contains curr_ctxt, then it's OK to use.
        if util.IsContainingContext(curr_ctxt, max_iter_ctxt):
            max_size *= max_iter
        else:
            # We cannot use max_iter because it's defined in a nested while
            # or cond context, so will fail if we try to use it as input to
            # any ops in curr_ctxt (e.g. max_size or the final accumulator
            # stack). Attempt to get a constant value out to use instead.
            const_max_iter = tensor_util.constant_value(max_iter)
            if const_max_iter is None:
                raise ValueError(
                    "Cannot create a gradient accumulator for tensor '%s' inside XLA "
                    "while_loop. maximum_iterations tensor '%s' for while_loop context "
                    "'%s' must be statically known (e.g. a constant value or known "
                    "shape dimension), or be defined at or outside the while loop "
                    "context '%s' (currently defined in '%s')." %
                    (value_name, max_iter.name, while_ctxt.name,
                     curr_ctxt_name, max_iter_ctxt.name))
            max_size *= const_max_iter

        # Find the next outer WhileContext (or stop if we reach the
        # tf.gradient's context).
        while_ctxt = util.GetContainingWhileContext(while_ctxt.outer_context,
                                                    stop_ctxt=curr_ctxt)

    return max_size
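
As a worked illustration of the multiplication above (standalone, hypothetical code; the private helper itself is only reached by TensorFlow's gradient machinery for while loops built inside an XLA context): with an outer loop capped at 3 iterations and an inner loop capped at 5, a stack that accumulates an inner-loop tensor must hold up to 3 * 5 = 15 values.

import tensorflow.compat.v1 as tf  # assumption: TF 1.x graph mode, as in the snippet above
tf.disable_eager_execution()

x = tf.constant(2.0)

def inner(v):
    # Inner while_loop, capped at 5 iterations; it captures `v` from the outer body.
    return tf.while_loop(lambda i, a: i < 5,
                         lambda i, a: (i + 1, a * v),
                         [0, tf.constant(1.0)],
                         maximum_iterations=5)[1]

# Outer while_loop, capped at 3 iterations. When such nested loops are built
# inside an XLA context, the gradient accumulator for a tensor inside `inner`
# needs max_size = 3 * 5 = 15, the product computed by the helper above.
_, y = tf.while_loop(lambda i, a: i < 3,
                     lambda i, a: (i + 1, a + inner(a)),
                     [0, x],
                     maximum_iterations=3)
grads = tf.gradients(y, x)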