Пример #1
0
 def _forward(self, inputs, h, c, opaque_params, training):
     """Calls the cudnn RNN op and regroups its three results.

     The arguments are forwarded unchanged to `cudnn_rnn_ops._cudnn_rnn`,
     together with the rnn mode, input mode, direction, dropout and seed stored
     on this object.

     Args:
       inputs: passed through as the op's first positional argument.
       h: passed through as the op's second positional argument.
       c: passed through as the op's third positional argument.
       opaque_params: passed through as the op's fourth positional argument.
       training: passed through as the op's fifth positional argument.

     Returns:
       A pair `(output, (output_h, output_c))` built from the three values the
       op returns.
     """
     rnn_results = cudnn_rnn_ops._cudnn_rnn(  # pylint:disable=protected-access
         inputs, h, c, opaque_params, training, self._rnn_mode,
         input_mode=self._input_mode,
         direction=self._direction,
         dropout=self._dropout,
         seed=self._seed)
     # The op is expected to yield exactly three values; the two states are
     # bundled so the caller receives a single state tuple.
     sequence_output, last_h, last_c = rnn_results
     final_state = (last_h, last_c)
     return sequence_output, final_state
Пример #2
0
 def _forward(self, inputs, h, c, opaque_params, training):
   """Runs `cudnn_rnn_ops._cudnn_rnn` with this object's configuration.

   `inputs`, `h`, `c`, `opaque_params` and `training` are handed to the op as
   given; the rnn mode, input mode, direction, dropout and seed come from the
   corresponding private attributes of `self`.

   Returns:
     `(output, (output_h, output_c))`, i.e. the op's first result followed by
     its second and third results packed into a tuple.
   """
   op_outputs = cudnn_rnn_ops._cudnn_rnn(  # pylint:disable=protected-access
       inputs, h, c, opaque_params, training, self._rnn_mode,
       input_mode=self._input_mode, direction=self._direction,
       dropout=self._dropout, seed=self._seed)
   # Unpack strictly into three names so an unexpected arity still fails here.
   y, h_out, c_out = op_outputs
   return y, (h_out, c_out)
Пример #3
0
def RunLSTM(sess,
            num_units,
            input_size,
            batch_size,
            time,
            num_layers=1,
            variable_seq_lengths=False,
            time_major=True,
            dynamic_shape_input=False,
            is_training=True,
            dropout=0.,
            num_dirs=True,
            dtype=dtypes.float32):
  """Runs a canonical LSTM and a cudnn LSTM built from the same weights.

  A `rnn_cell_impl.LSTMCell` driven by `rnn.dynamic_rnn` and a cudnn LSTM
  (`cudnn_rnn_ops._cudnn_rnn` with `CUDNN_LSTM`) are built on the same kernel,
  bias, inputs and initial states. Both are evaluated in `sess` and their
  results are returned side by side so the caller can compare them.

  Args:
    sess: session used to run the variable initializer and all fetches.
    num_units: number of LSTM units.
    input_size: size of the last dimension of the inputs.
    batch_size: number of sequences in the batch.
    time: number of time steps of the inputs.
    num_layers: must be 1 (asserted).
    variable_seq_lengths: if True, random per-example lengths in `[1, time]`
      are generated (example 0 is forced to `time`) and passed to both
      networks; otherwise no sequence lengths are passed.
    time_major: if True inputs are `[time, batch_size, input_size]`, otherwise
      `[batch_size, time, input_size]`.
    dynamic_shape_input: if True the cudnn network reads its inputs from a
      placeholder of unknown shape that is fed with the same numpy data;
      the canonical network always reads the statically shaped variable.
    is_training: if True gradients are built and evaluated as well.
    dropout: dropout passed to the cudnn op. Must be close to 0 when
      `is_training` is True.
    num_dirs: must compare equal to 1 (asserted); the default `True` does.
    dtype: dtype of inputs, states and weights.

  Returns:
    If `is_training`, the 12-tuple
    `(outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad,
    cu_inp_grad, state_grad, cu_state_grad, wgrad, bgrad, cu_wgrad,
    cu_bgrad)`, where the `*state_grad` entries are `(h_grad, c_grad)` pairs.
    Otherwise the 4-tuple `(outputs, cu_outputs, state_tuple, cu_state_tuple)`.
    Names prefixed with `cu_` come from the cudnn network.

  Raises:
    ValueError: if `is_training` is True and `dropout` is not close to 0.
  """
  # TODO(jamesqin): add multi-layer tests.
  # TODO(jamesqin): add multi-dir tests
  assert num_layers == 1
  assert num_dirs == 1
  if is_training and not np.isclose(dropout, 0):
    # The guard rejects any non-zero dropout when gradients are going to be
    # built, so the message has to ask for dropout == 0.
    raise ValueError("dropout must be 0. when testing training.")

  # set graph level random seed and numpy random seed.
  random_seed.set_random_seed(0)
  np.random.seed(0)

  shape = ([time, batch_size, input_size]
           if time_major else [batch_size, time, input_size])
  inputs_np = np.random.rand(*shape).astype(dtype.as_numpy_dtype)
  inputs_static = variable_scope.get_variable(
      "inputs", initializer=inputs_np, dtype=dtype)
  inputs_dynamic = array_ops.placeholder(
      dtype, shape=[None, None, None], name="inputs")
  # Only the cudnn network uses `inputs`; the canonical one always reads
  # `inputs_static`.
  inputs = inputs_dynamic if dynamic_shape_input else inputs_static
  initial_h_op = variable_scope.get_variable(
      "initial_h_op",
      initializer=np.random.rand(batch_size,
                                 num_units).astype(dtype.as_numpy_dtype),
      dtype=dtype)
  initial_c_op = variable_scope.get_variable(
      "initial_c_op",
      initializer=np.random.rand(batch_size,
                                 num_units).astype(dtype.as_numpy_dtype),
      dtype=dtype)

  if variable_seq_lengths:
    lengths_v = np.random.randint(low=1, high=time + 1, size=batch_size)
    lengths_v[0] = time  # make sure the max sequence has 'time' elems
    lengths = ops.convert_to_tensor(lengths_v.astype(np.int32))
  else:
    lengths = None

  initializer = init_ops.random_uniform_initializer(
      -0.01, 0.01, dtype=dtype, seed=19980904)

  with variable_scope.variable_scope("test", initializer=initializer):
    # Create the weights up front under the names the cell will look up, so
    # that the cell (reuse=True) and the cudnn conversion share them.
    w = variable_scope.get_variable(
        "rnn/lstm_cell/kernel",
        shape=[input_size + num_units, num_units * 4],
        dtype=dtype)
    b = variable_scope.get_variable(
        "rnn/lstm_cell/bias", shape=[num_units * 4], dtype=dtype)

    # canonical lstm. must set forget_bias to 0. to align with cudnn lstm.
    cell = rnn_cell_impl.LSTMCell(num_units, forget_bias=0., reuse=True)
    outputs_op, state_tuple_op = rnn.dynamic_rnn(
        cell,
        inputs_static,
        sequence_length=lengths,
        initial_state=rnn_cell_impl.LSTMStateTuple(
            h=initial_h_op, c=initial_c_op),
        dtype=dtype,
        time_major=time_major,
        scope=None)

  # Convert to cudnn opaque param.
  format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterLSTM(
      num_layers, num_units, input_size)
  opaque_params = format_converter.tf_canonical_to_opaque([w, b])

  # The cudnn op takes states with one extra (size 1) dimension; its position
  # depends on the layout.
  state_axis = 0 if time_major else 1
  cu_initial_h_op = array_ops.expand_dims(initial_h_op, axis=state_axis)
  cu_initial_c_op = array_ops.expand_dims(initial_c_op, axis=state_axis)
  cu_outputs_op, cu_h_op, cu_c_op = cudnn_rnn_ops._cudnn_rnn(
      inputs,
      cu_initial_h_op,
      cu_initial_c_op,
      opaque_params,
      sequence_lengths=lengths,
      time_major=time_major,
      dropout=dropout,
      is_training=is_training,
      rnn_mode=cudnn_rnn_ops.CUDNN_LSTM)
  # Remove the trivial extra dimension again.
  cu_state_tuple_op = rnn_cell_impl.LSTMStateTuple(
      c=array_ops.squeeze(cu_c_op, axis=state_axis),
      h=array_ops.squeeze(cu_h_op, axis=state_axis))

  if is_training:
    (inp_grad_op, hgrad_op,
     cgrad_op, wgrad_op, bgrad_op) = gradients_impl.gradients(
         outputs_op, [inputs_static, initial_h_op, initial_c_op, w, b])

    (cu_inp_grad_op, cu_hgrad_op,
     cu_cgrad_op, opaque_grad_op) = gradients_impl.gradients(
         cu_outputs_op,
         [inputs, cu_initial_h_op, cu_initial_c_op, opaque_params])
    # Remove the trivial extra dimension from the state gradients.
    cu_hgrad_op = array_ops.squeeze(cu_hgrad_op, axis=state_axis)
    cu_cgrad_op = array_ops.squeeze(cu_cgrad_op, axis=state_axis)

    cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical(
        opaque_grad_op)
    # Single layer: take the only kernel / bias gradient.
    cu_wgrad_op = cu_wgrad_op[0]
    cu_bgrad_op = cu_bgrad_op[0]
    # cudnn lstm has 2 biases each gate. When converting to tf canonical format,
    # the two biases are summed into one. Thus here bias gradient should be
    # halved when comparing with tf lstm.
    cu_bgrad_op *= 0.5

  init_op = variables.global_variables_initializer()
  sess.run(init_op)

  # The placeholder only needs feeding when the cudnn network reads from it.
  cu_feed_dict = {inputs: inputs_np} if dynamic_shape_input else None

  if is_training:
    outputs, state_tuple, inp_grad, state_grad, wgrad, bgrad = sess.run([
        outputs_op, state_tuple_op, inp_grad_op,
        (hgrad_op, cgrad_op), wgrad_op, bgrad_op
    ])
    (cu_outputs, cu_state_tuple, cu_inp_grad, cu_state_grad, cu_wgrad,
     cu_bgrad) = sess.run(
         [
             cu_outputs_op, cu_state_tuple_op, cu_inp_grad_op,
             (cu_hgrad_op, cu_cgrad_op), cu_wgrad_op, cu_bgrad_op
         ],
         feed_dict=cu_feed_dict)

    logging.vlog(1, "outputs: %s" % outputs)
    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
    logging.vlog(1, "state_tuple: %s" % str(state_tuple))
    logging.vlog(1, "cu_state_tuple: %s" % str(cu_state_tuple))
    logging.vlog(1, "inp_grad: %s" % inp_grad)
    logging.vlog(1, "cu_inp_grad: %s" % cu_inp_grad)
    logging.vlog(1, "state_grad: %s" % str(state_grad))
    logging.vlog(1, "cu_state_grad: %s" % str(cu_state_grad))
    logging.vlog(1, "wgrad: %s" % str(wgrad))
    logging.vlog(1, "bgrad: %s" % str(bgrad))
    logging.vlog(1, "cu_wgrad: %s" % str(cu_wgrad))
    logging.vlog(1, "cu_bgrad: %s" % str(cu_bgrad))
    return (outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad,
            cu_inp_grad, state_grad, cu_state_grad, wgrad, bgrad, cu_wgrad,
            cu_bgrad)

  # Inference only: forward results of both networks.
  outputs, state_tuple = sess.run([outputs_op, state_tuple_op])
  cu_outputs, cu_state_tuple = sess.run([cu_outputs_op, cu_state_tuple_op],
                                        feed_dict=cu_feed_dict)

  logging.vlog(1, "outputs: %s" % outputs)
  logging.vlog(1, "cu_outputs: %s" % cu_outputs)
  logging.vlog(1, "state_tuple: %s" % str(state_tuple))
  logging.vlog(1, "cu_state_tuple: %s" % str(cu_state_tuple))
  return outputs, cu_outputs, state_tuple, cu_state_tuple
Пример #4
0
def RunGRU(sess,
           num_units,
           input_size,
           batch_size,
           time,
           num_layers=1,
           is_training=True,
           variable_seq_lengths=False,
           time_major=True,
           dynamic_shape_input=False,
           dropout=0.,
           num_dirs=True,
           dtype=dtypes.float32):
  """Runs a cudnn-compatible GRU cell and a cudnn GRU on the same weights.

  A `cudnn_rnn_ops.CudnnCompatibleGRUCell` driven by `rnn.dynamic_rnn` and a
  cudnn GRU (`cudnn_rnn_ops._cudnn_rnn` with `CUDNN_GRU`) are built on the
  same kernels, biases, inputs and initial state. Both are evaluated in `sess`
  and their results are returned side by side so the caller can compare them.

  Args:
    sess: session used to run the variable initializer and all fetches.
    num_units: number of GRU units.
    input_size: size of the last dimension of the inputs.
    batch_size: number of sequences in the batch.
    time: number of time steps of the inputs.
    num_layers: must be 1 (asserted).
    is_training: if True gradients are built and evaluated as well.
    variable_seq_lengths: if True, random per-example lengths in `[1, time]`
      are generated (example 0 is forced to `time`) and passed to both
      networks; otherwise no sequence lengths are passed.
    time_major: if True inputs are `[time, batch_size, input_size]`, otherwise
      `[batch_size, time, input_size]`.
    dynamic_shape_input: if True the cudnn network reads its inputs from a
      placeholder of unknown shape that is fed with the same numpy data;
      the cell-based network always reads the statically shaped variable.
    dropout: dropout passed to the cudnn op. Must be close to 0 when
      `is_training` is True.
    num_dirs: must compare equal to 1 (asserted); the default `True` does.
    dtype: dtype of inputs, state and weights.

  Returns:
    If `is_training`, the 12-tuple
    `(outputs, cu_outputs, h, cu_h, inp_grad, cu_inp_grad, hgrad, cu_hgrad,
    wgrad, bgrad, cu_wgrad, cu_bgrad)`, where each `*wgrad` / `*bgrad` entry
    is a triple ordered as (gates, candidate input projection, candidate
    hidden projection).
    Otherwise the 4-tuple `(outputs, cu_outputs, h, cu_h)`.
    Names prefixed with `cu_` come from the cudnn network.

  Raises:
    ValueError: if `is_training` is True and `dropout` is not close to 0.
  """
  # TODO(jamesqin): add multi-layer tests.
  # TODO(jamesqin): add multi-dir tests
  assert num_layers == 1
  assert num_dirs == 1
  if is_training and not np.isclose(dropout, 0):
    # The guard rejects any non-zero dropout when gradients are going to be
    # built, so the message has to ask for dropout == 0.
    raise ValueError("dropout must be 0. when testing training.")

  # set graph level random seed and numpy random seed.
  random_seed.set_random_seed(0)
  np.random.seed(0)

  shape = ([time, batch_size, input_size]
           if time_major else [batch_size, time, input_size])
  inputs_np = np.random.rand(*shape).astype(dtype.as_numpy_dtype)
  inputs_static = variable_scope.get_variable(
      "inputs", initializer=inputs_np, dtype=dtype)
  inputs_dynamic = array_ops.placeholder(
      dtype, shape=[None, None, None], name="inputs")
  # Only the cudnn network uses `inputs`; the cell-based one always reads
  # `inputs_static`.
  inputs = inputs_dynamic if dynamic_shape_input else inputs_static
  initial_h_op = variable_scope.get_variable(
      "initial_h_op",
      initializer=np.random.rand(batch_size,
                                 num_units).astype(dtype.as_numpy_dtype),
      dtype=dtype)

  if variable_seq_lengths:
    lengths_v = np.random.randint(low=1, high=time + 1, size=batch_size)
    lengths_v[0] = time  # make sure the max sequence has 'time' elems
    lengths = ops.convert_to_tensor(lengths_v.astype(np.int32))
  else:
    lengths = None

  initializer = init_ops.random_uniform_initializer(
      -0.01, 0.01, dtype=dtype, seed=19980904)
  with variable_scope.variable_scope("test", initializer=initializer):
    # Create the weights up front under the names the cell will look up, so
    # that the cell (reuse=True) and the cudnn conversion share them.
    gate_kernel = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/gates/kernel",
        shape=[input_size + num_units, num_units * 2],
        dtype=dtype)
    gate_bias = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/gates/bias",
        shape=[num_units * 2],
        dtype=dtype)
    candidate_inp_kernel = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/candidate/input_projection/kernel",
        shape=[input_size, num_units],
        dtype=dtype)
    candidate_inp_bias = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/candidate/input_projection/bias",
        shape=[num_units],
        dtype=dtype)
    candidate_hid_kernel = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/candidate/hidden_projection/kernel",
        shape=[num_units, num_units],
        dtype=dtype)
    candidate_hid_bias = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/candidate/hidden_projection/bias",
        shape=[num_units],
        dtype=dtype)

    cell = cudnn_rnn_ops.CudnnCompatibleGRUCell(num_units, reuse=True)
    outputs_op, h_op = rnn.dynamic_rnn(
        cell,
        inputs_static,
        sequence_length=lengths,
        initial_state=initial_h_op,
        dtype=dtype,
        time_major=time_major,
        scope=None)

  ws = [gate_kernel, candidate_inp_kernel, candidate_hid_kernel]
  bs = [gate_bias, candidate_inp_bias, candidate_hid_bias]
  # Convert to cudnn opaque param.
  format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterGRU(
      num_layers, num_units, input_size)
  opaque_params = format_converter.tf_canonical_to_opaque(ws + bs)

  # The cudnn op takes the state with one extra (size 1) dimension; its
  # position depends on the layout.
  state_axis = 0 if time_major else 1
  cu_initial_h_op = array_ops.expand_dims(initial_h_op, axis=state_axis)
  cu_outputs_op, cu_h_op, _ = cudnn_rnn_ops._cudnn_rnn(
      inputs,
      cu_initial_h_op,
      array_ops.zeros_like(cu_initial_h_op),  # not used
      opaque_params,
      sequence_lengths=lengths,
      time_major=time_major,
      dropout=dropout,
      is_training=is_training,
      rnn_mode=cudnn_rnn_ops.CUDNN_GRU)

  if is_training:
    (inp_grad_op, hgrad_op, gk_grad_op, cik_grad_op, chk_grad_op, gb_grad_op,
     cib_grad_op, chb_grad_op) = gradients_impl.gradients(
         outputs_op, [inputs_static, initial_h_op] + ws + bs)

    (cu_inp_grad_op, cu_hgrad_op, opaque_grad_op) = gradients_impl.gradients(
        cu_outputs_op, [inputs, cu_initial_h_op, opaque_params])
    # Remove the trivial extra dimension from the state gradient.
    cu_hgrad_op = array_ops.squeeze(cu_hgrad_op, axis=state_axis)

    cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical(
        opaque_grad_op)
    (cu_gk_grad_op, cu_cik_grad_op, cu_chk_grad_op) = cu_wgrad_op
    (cu_gb_grad_op, cu_cib_grad_op, cu_chb_grad_op) = cu_bgrad_op
    # cudnn gru has 2 biases for reset and update gates. When converting to tf
    # canonical format, the two biases are summed into one.  Thus here relevant
    # bias gradient should be halved before comparing with tf gru.
    cu_gb_grad_op *= 0.5

  init_op = variables.global_variables_initializer()
  sess.run(init_op)

  # The placeholder only needs feeding when the cudnn network reads from it.
  cu_feed_dict = {inputs: inputs_np} if dynamic_shape_input else None

  if is_training:
    outputs, h, inp_grad, hgrad, wgrad, bgrad = sess.run([
        outputs_op, h_op, inp_grad_op, hgrad_op,
        (gk_grad_op, cik_grad_op, chk_grad_op),
        (gb_grad_op, cib_grad_op, chb_grad_op)
    ])
    (cu_outputs, cu_h, cu_inp_grad, cu_hgrad, cu_wgrad, cu_bgrad) = sess.run(
        [
            cu_outputs_op, cu_h_op, cu_inp_grad_op, cu_hgrad_op,
            (cu_gk_grad_op, cu_cik_grad_op, cu_chk_grad_op),
            (cu_gb_grad_op, cu_cib_grad_op, cu_chb_grad_op)
        ],
        feed_dict=cu_feed_dict)
    # Remove the trivial extra dimension from the cudnn final state.
    cu_h = np.squeeze(cu_h, axis=state_axis)

    logging.vlog(1, "outputs: %s" % outputs)
    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
    logging.vlog(1, "h: %s" % h)
    # Log the cudnn state itself, not the cell-based `h` a second time.
    logging.vlog(1, "cu_h: %s" % cu_h)
    logging.vlog(1, "inp_grad: %s" % inp_grad)
    logging.vlog(1, "cu_inp_grad: %s" % cu_inp_grad)
    logging.vlog(1, "hgrad: %s" % hgrad)
    logging.vlog(1, "cu_hgrad: %s" % cu_hgrad)
    logging.vlog(1, "wgrad: %s" % str(wgrad))
    logging.vlog(1, "bgrad: %s" % str(bgrad))
    logging.vlog(1, "cu_wgrad: %s" % str(cu_wgrad))
    logging.vlog(1, "cu_bgrad: %s" % str(cu_bgrad))
    return (outputs, cu_outputs, h, cu_h, inp_grad, cu_inp_grad, hgrad,
            cu_hgrad, wgrad, bgrad, cu_wgrad, cu_bgrad)

  # Inference only: forward results of both networks.
  outputs, h = sess.run([outputs_op, h_op])
  cu_outputs, cu_h = sess.run([cu_outputs_op, cu_h_op],
                              feed_dict=cu_feed_dict)
  # Remove the trivial extra dimension from the cudnn final state.
  cu_h = np.squeeze(cu_h, axis=state_axis)

  logging.vlog(1, "outputs: %s" % outputs)
  logging.vlog(1, "cu_outputs: %s" % cu_outputs)
  logging.vlog(1, "h: %s" % h)
  # Log the cudnn state itself, not the cell-based `h` a second time.
  logging.vlog(1, "cu_h: %s" % cu_h)
  return outputs, cu_outputs, h, cu_h