예제 #1
0
    def _test_lstm_helper(self, num_units, input_size, num_layers, direction):
        """Checks the LSTM canonical <-> opaque parameter conversion round trip.

        Random canonical weights and biases are converted to the cudnn opaque
        format and back, and the recovered values are compared with the
        originals. The opaque parameter size reported by
        `cudnn_rnn_opaque_params_size` is also checked to be no smaller than
        the total number of canonical weight and bias elements.

        Args:
          num_units: the number of units; every weight is created with shape
            `[input_size + num_units, 4 * num_units]` and every bias with
            shape `[4 * num_units]`.
          input_size: the input size, used in the weight shape above.
          num_layers: the number of layers; one weight/bias pair is created
            per layer and direction.
          direction: the cudnn direction mode. Anything other than
            `cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION` is treated as two
            directions.
        """
        with self.session(use_gpu=True) as sess:
            # Fix both the graph-level and the numpy seed so the generated
            # parameters are reproducible.
            random_seed.set_random_seed(0)
            np.random.seed(0)

            num_dirs = 1 if direction == cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION else 2
            format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterLSTM(
                num_layers, num_units, input_size, direction=direction)

            ws, bs = [], []
            for _ in range(num_layers * num_dirs):
                w = constant_op.constant(np.random.rand(
                    input_size + num_units, 4 * num_units),
                                         dtype=dtypes.float32)
                b = constant_op.constant(np.random.rand(4 * num_units),
                                         dtype=dtypes.float32)
                ws.append(w)
                bs.append(b)

            opaque_params = format_converter.tf_canonical_to_opaque(ws + bs)
            opaque_params_size = cudnn_rnn_ops.cudnn_rnn_opaque_params_size(
                cudnn_rnn_ops.CUDNN_LSTM,
                num_layers,
                num_units,
                input_size,
                direction=direction)

            ws_r, bs_r = format_converter.opaque_to_tf_canonical(opaque_params)

            # Test tf_canonical_to_opaque() followed by opaque_to_tf_canonical()
            # returns the original input.
            ws_v, ws_r_v, bs_v, bs_r_v = sess.run([ws, ws_r, bs, bs_r])
            for w_v, w_r_v in zip(ws_v, ws_r_v):
                self.assertAllClose(w_v, w_r_v)
            for b_v, b_r_v in zip(bs_v, bs_r_v):
                self.assertAllClose(b_v, b_r_v)

            # Test opaque_params size lower bound. The builtin sum() is used
            # for both terms: passing a generator to np.sum() is deprecated.
            opaque_params_size_v = sess.run(opaque_params_size)
            min_params_size = (sum(x.size for x in ws_v) +
                               sum(x.size for x in bs_v))
            logging.info("min_parm_size: %d vs actual_opaque_param_size: %d",
                         min_params_size, opaque_params_size_v)
            self.assertLessEqual(min_params_size, opaque_params_size_v)
예제 #2
0
def RunLSTM(sess,
            num_units,
            input_size,
            batch_size,
            time,
            num_layers=1,
            variable_seq_lengths=False,
            time_major=True,
            dynamic_shape_input=False,
            is_training=True,
            dropout=0.,
            num_dirs=True,
            dtype=dtypes.float32):
  """Runs a canonical TF LSTM and a cudnn LSTM on the same data.

  Both networks share the same kernel/bias variables and the same initial
  states. The canonical LSTM always reads the static `inputs` variable; the
  cudnn LSTM reads either that variable or a fully dynamic-shaped placeholder
  fed with the same numpy values, depending on `dynamic_shape_input`.

  Args:
    sess: the session used to initialize variables and evaluate the ops.
    num_units: the number of LSTM units.
    input_size: the size of the last dimension of the inputs.
    batch_size: the batch size.
    time: the number of time steps.
    num_layers: the number of layers; must be 1.
    variable_seq_lengths: if True, random per-example sequence lengths in
      `[1, time]` are generated (the first one is forced to `time`) and passed
      to both networks; otherwise no sequence lengths are passed.
    time_major: if True inputs are `[time, batch_size, input_size]`, otherwise
      `[batch_size, time, input_size]`.
    dynamic_shape_input: if True the cudnn LSTM consumes a placeholder with
      unknown shape instead of the static inputs variable.
    is_training: whether to also build and evaluate gradients.
    dropout: the dropout value forwarded to the cudnn LSTM; must be close to 0
      when `is_training` is True.
    num_dirs: the number of directions; must compare equal to 1 (the default
      `True` does).
    dtype: the TF dtype of inputs, states and parameters.

  Returns:
    When `is_training` is True, a 12-tuple
    `(outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad,
    cu_inp_grad, state_grad, cu_state_grad, wgrad, bgrad, cu_wgrad,
    cu_bgrad)`; otherwise the 4-tuple
    `(outputs, cu_outputs, state_tuple, cu_state_tuple)`. Names prefixed with
    `cu_` are the cudnn results.

  Raises:
    ValueError: if `is_training` is True and `dropout` is not close to 0.
  """
  # TODO(jamesqin): add multi-layer tests.
  # TODO(jamesqin): add multi-dir tests
  assert num_layers == 1
  assert num_dirs == 1
  if is_training and not np.isclose(dropout, 0):
    raise ValueError("dropout must be 0. when testing training.")

  # set graph level random seed and numpy random seed.
  random_seed.set_random_seed(0)
  np.random.seed(0)

  shape = ([time, batch_size, input_size]
           if time_major else [batch_size, time, input_size])
  inputs_np = np.random.rand(*shape).astype(dtype.as_numpy_dtype)
  inputs_static = variable_scope.get_variable(
      "inputs", initializer=inputs_np, dtype=dtype)
  inputs_dynamic = array_ops.placeholder(
      dtype, shape=[None, None, None], name="inputs")
  inputs = inputs_dynamic if dynamic_shape_input else inputs_static
  # The placeholder only needs feeding when the cudnn LSTM actually reads it.
  cu_feed_dict = {inputs: inputs_np} if dynamic_shape_input else None

  initial_h_op = variable_scope.get_variable(
      "initial_h_op",
      initializer=np.random.rand(batch_size,
                                 num_units).astype(dtype.as_numpy_dtype),
      dtype=dtype)
  initial_c_op = variable_scope.get_variable(
      "initial_c_op",
      initializer=np.random.rand(batch_size,
                                 num_units).astype(dtype.as_numpy_dtype),
      dtype=dtype)

  if variable_seq_lengths:
    lengths_v = np.random.randint(low=1, high=time + 1, size=batch_size)
    lengths_v[0] = time  # make sure the max sequence has 'time' elems
    lengths = ops.convert_to_tensor(lengths_v.astype(np.int32))
  else:
    lengths = None

  initializer = init_ops.random_uniform_initializer(
      -0.01, 0.01, dtype=dtype, seed=19980904)

  with variable_scope.variable_scope("test", initializer=initializer):
    # Created up front under the names the LSTMCell below looks up, so that
    # the cell (built with reuse=True) shares them and they can also be
    # converted to the cudnn opaque format.
    w = variable_scope.get_variable(
        "rnn/lstm_cell/kernel",
        shape=[input_size + num_units, num_units * 4],
        dtype=dtype)
    b = variable_scope.get_variable(
        "rnn/lstm_cell/bias", shape=[num_units * 4], dtype=dtype)

    # canonical lstm. must set forget_bias to 0. to align with cudnn lstm.
    cell = rnn_cell_impl.LSTMCell(num_units, forget_bias=0., reuse=True)
    outputs_op, state_tuple_op = rnn.dynamic_rnn(
        cell,
        inputs_static,
        sequence_length=lengths,
        initial_state=rnn_cell_impl.LSTMStateTuple(
            h=initial_h_op, c=initial_c_op),
        dtype=dtype,
        time_major=time_major,
        scope=None)

  # Convert to cudnn opaque param.
  format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterLSTM(
      num_layers, num_units, input_size)
  opaque_params = format_converter.tf_canonical_to_opaque([w, b])

  # The cudnn op takes states with one extra size-1 dimension; its position
  # depends on time_major.
  state_axis = 0 if time_major else 1
  cu_initial_h_op = array_ops.expand_dims(initial_h_op, axis=state_axis)
  cu_initial_c_op = array_ops.expand_dims(initial_c_op, axis=state_axis)
  cu_outputs_op, cu_h_op, cu_c_op = cudnn_rnn_ops._cudnn_rnn(
      inputs,
      cu_initial_h_op,
      cu_initial_c_op,
      opaque_params,
      sequence_lengths=lengths,
      time_major=time_major,
      dropout=dropout,
      is_training=is_training,
      rnn_mode=cudnn_rnn_ops.CUDNN_LSTM)
  # Remove the size-1 dimension added above.
  cu_state_tuple_op = rnn_cell_impl.LSTMStateTuple(
      c=array_ops.squeeze(cu_c_op, axis=state_axis),
      h=array_ops.squeeze(cu_h_op, axis=state_axis))

  if is_training:
    (inp_grad_op, hgrad_op,
     cgrad_op, wgrad_op, bgrad_op) = gradients_impl.gradients(
         outputs_op, [inputs_static, initial_h_op, initial_c_op, w, b])

    (cu_inp_grad_op, cu_hgrad_op,
     cu_cgrad_op, opaque_grad_op) = gradients_impl.gradients(
         cu_outputs_op,
         [inputs, cu_initial_h_op, cu_initial_c_op, opaque_params])
    # Remove the size-1 dimension from the state gradients as well.
    cu_hgrad_op = array_ops.squeeze(cu_hgrad_op, axis=state_axis)
    cu_cgrad_op = array_ops.squeeze(cu_cgrad_op, axis=state_axis)

    cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical(
        opaque_grad_op)
    cu_wgrad_op = cu_wgrad_op[0]
    cu_bgrad_op = cu_bgrad_op[0]
    # cudnn lstm has 2 biases each gate. When converting to tf canonical format,
    # the two biases are summed into one. Thus here bias gradient should be
    # halved when comparing with tf lstm.
    cu_bgrad_op *= 0.5

  init_op = variables.global_variables_initializer()
  sess.run(init_op)

  if is_training:
    outputs, state_tuple, inp_grad, state_grad, wgrad, bgrad = sess.run([
        outputs_op, state_tuple_op, inp_grad_op,
        (hgrad_op, cgrad_op), wgrad_op, bgrad_op
    ])
    (cu_outputs, cu_state_tuple, cu_inp_grad, cu_state_grad, cu_wgrad,
     cu_bgrad) = sess.run(
         [
             cu_outputs_op, cu_state_tuple_op, cu_inp_grad_op,
             (cu_hgrad_op, cu_cgrad_op), cu_wgrad_op, cu_bgrad_op
         ],
         feed_dict=cu_feed_dict)

    # Values are passed as logging arguments so the (potentially large)
    # arrays are only formatted when the message is actually emitted.
    logging.vlog(1, "outputs: %s", outputs)
    logging.vlog(1, "cu_outputs: %s", cu_outputs)
    logging.vlog(1, "state_tuple: %s", state_tuple)
    logging.vlog(1, "cu_state_tuple: %s", cu_state_tuple)
    logging.vlog(1, "inp_grad: %s", inp_grad)
    logging.vlog(1, "cu_inp_grad: %s", cu_inp_grad)
    logging.vlog(1, "state_grad: %s", state_grad)
    logging.vlog(1, "cu_state_grad: %s", cu_state_grad)
    logging.vlog(1, "wgrad: %s", wgrad)
    logging.vlog(1, "bgrad: %s", bgrad)
    logging.vlog(1, "cu_wgrad: %s", cu_wgrad)
    logging.vlog(1, "cu_bgrad: %s", cu_bgrad)
    return (outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad,
            cu_inp_grad, state_grad, cu_state_grad, wgrad, bgrad, cu_wgrad,
            cu_bgrad)

  # Inference only: no gradients are built or evaluated.
  outputs, state_tuple = sess.run([outputs_op, state_tuple_op])
  cu_outputs, cu_state_tuple = sess.run([cu_outputs_op, cu_state_tuple_op],
                                        feed_dict=cu_feed_dict)

  logging.vlog(1, "outputs: %s", outputs)
  logging.vlog(1, "cu_outputs: %s", cu_outputs)
  logging.vlog(1, "state_tuple: %s", state_tuple)
  logging.vlog(1, "cu_state_tuple: %s", cu_state_tuple)
  return outputs, cu_outputs, state_tuple, cu_state_tuple