def testBiasVec(self):
    """Asserts that bias_add raises ValueError when the bias has shape [1, 2]."""
    operand_shape = [1, 2]
    with self.assertRaises(ValueError):
        # Value and bias are built identically, so the bias is not 1-D here.
        value = array_ops.reshape([1, 2], shape=operand_shape)
        bias = array_ops.reshape([1, 2], shape=operand_shape)
        nn_ops.bias_add(value, bias)
def _testGradient(self, np_input, bias, dtype, data_format, use_gpu):
    """Compares theoretical and numerical Jacobians around bias_add.

    Three Jacobian pairs are checked: the output with respect to the input,
    the output with respect to the bias, and the bias gradient of
    `l2_loss(output)` with respect to the output.

    Args:
      np_input: Input array; passed through `self._NHWCToNCHW` when
        `data_format` is "NCHW".
      bias: Bias array; its `.shape` is used for the bias tensor.
      dtype: dtype used to create the tensors under test.
      data_format: Forwarded to `nn_ops.bias_add`.
      use_gpu: Forwarded to `self.test_session`.
    """
    with self.test_session(use_gpu=use_gpu):
        if data_format == "NCHW":
            np_input = self._NHWCToNCHW(np_input)

        def build_jacobians(tensor_dtype):
            """Builds the ops with `tensor_dtype`; returns three Jacobian pairs."""
            x = constant_op.constant(
                np_input, shape=np_input.shape, dtype=tensor_dtype)
            b = constant_op.constant(bias, shape=bias.shape, dtype=tensor_dtype)
            y = nn_ops.bias_add(x, b, data_format=data_format)
            wrt_input = gradient_checker.compute_gradient(
                x, np_input.shape, y, np_input.shape)
            wrt_bias = gradient_checker.compute_gradient(
                b, bias.shape, y, np_input.shape)
            # Test gradient of BiasAddGrad
            bias_grad = gradients_impl.gradients(nn_ops.l2_loss(y), b)[0]
            of_bias_grad = gradient_checker.compute_gradient(
                y, np_input.shape, bias_grad, bias.shape)
            return wrt_input, wrt_bias, of_bias_grad

        input_pair, bias_pair, grad_pair = build_jacobians(dtype)
        tensor_jacob_t, tensor_jacob_n = input_pair
        bias_jacob_t, bias_jacob_n = bias_pair
        grad_jacob_t, grad_jacob_n = grad_pair

        if dtype == np.float16:
            # Compare fp16 theoretical gradients to fp32 numerical gradients,
            # since fp16 numerical gradients are too imprecise unless great
            # care is taken with choosing the inputs and the delta. This is
            # a weaker check (in particular, it does not test the op itself,
            # only its gradient), but it's much better than nothing.
            fp32_input, fp32_bias, fp32_grad = build_jacobians(np.float32)
            _, tensor_jacob_n = fp32_input
            _, bias_jacob_n = fp32_bias
            _, grad_jacob_n = fp32_grad

        threshold = 1e-10 if dtype == dtypes.float64 else 2e-3
        self.assertAllClose(tensor_jacob_t, tensor_jacob_n, threshold, threshold)
        # TODO(annarev): Re-add assertion for float16, float32 dtypes and NCHW
        # once we figure out why this check started failing with cuda mavx.
        if dtype == dtypes.float64 or data_format != "NCHW":
            self.assertAllClose(bias_jacob_t, bias_jacob_n, threshold, threshold)
            self.assertAllClose(grad_jacob_t, grad_jacob_n, threshold, threshold)
def SimulateFusedConv2dBiasActivationInt8(conv_input_scale, conv_input, kernel,
                                          padding, strides, side_input_scale,
                                          side_input, biases):
    """Simulates the int8 fused 2-D convolution op using separate float ops.

    The arguments and return values have the same format, meanings and
    restrictions as the actual op. The simulation always applies relu before
    quantizing.

    Args:
      conv_input_scale: A scalar 'float'.
      conv_input: A `Tensor` of type `qint8` in NCHW_VECT_C layout.
      kernel: A `Tensor` of type `qint8` in OIHW_VECT_I layout.
      padding: A `string` from: `"SAME", "VALID"`.
      strides: A list of `ints`.
      side_input_scale: A scalar 'float'.
      side_input: A `Tensor` of type `qint8` in NCHW_VECT_C layout.
      biases: A `Tensor` of type `float32` in NCHW layout.

    Returns:
      A `Tensor` of type `qint8` in NCHW_VECT_C layout.
    """
    # Dequantize the operands and bring them into the layouts conv2d is given.
    float_input = NchwVectCToNchw(
        gen_array_ops.dequantize(conv_input, -128, 127))
    float_kernel = OihwVectIToHwio(gen_array_ops.dequantize(kernel, -128, 127))
    conv_result = nn_ops.conv2d(
        float_input,
        float_kernel,
        strides=strides,
        padding=padding,
        data_format="NCHW") * conv_input_scale

    float_side_input = NchwVectCToNchw(
        gen_array_ops.dequantize(side_input, -128, 127))
    conv_and_side_inputs = conv_result + side_input_scale * float_side_input

    logit = nn_ops.bias_add(conv_and_side_inputs, biases, data_format="NCHW")
    activated = NchwToNchwVectC(nn_ops.relu(logit))

    # quantize_v2 yields three values; only the first one is returned.
    quantized, _, _ = gen_array_ops.quantize_v2(
        activated, -128, 127, dtypes.qint8)
    return quantized
def _testBiasNCHW(self, np_inputs, np_bias, use_gpu):
    """Checks NCHW bias_add against the `_npBias` reference result.

    The inputs are converted with `_NHWCToNCHW` before the op runs and the
    op's result is converted back with `_NCHWToNHWC` before comparison.
    """
    expected = self._npBias(np_inputs, np_bias)
    nchw_inputs = self._NHWCToNCHW(np_inputs)
    with self.cached_session(use_gpu=use_gpu):
        bias_add_op = nn_ops.bias_add(nchw_inputs, np_bias, data_format="NCHW")
        actual = self._NCHWToNHWC(bias_add_op.eval())
        self.assertAllCloseAccordingToType(self._AtLeast3d(expected), actual)
def call(self, inputs, state):
    """Most basic RNN: output = new_state = act(W * input + U * state + B)."""
    # A single matmul over the concatenated [inputs, state] covers both the
    # input and the recurrent contribution.
    joined = array_ops.concat([inputs, state], 1)
    pre_activation = nn_ops.bias_add(
        math_ops.matmul(joined, self._kernel), self._bias)
    new_state = self._activation(pre_activation)
    return new_state, new_state
def _linear(args,
            output_size,
            bias,
            bias_initializer=None,
            kernel_initializer=None):
    """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.

    Args:
      args: a 2D Tensor or a list of 2D, batch x n, Tensors.
      output_size: int, second dimension of W[i].
      bias: boolean, whether to add a bias term or not.
      bias_initializer: starting value to initialize the bias;
        None by default, in which case a constant 0.0 initializer is used.
      kernel_initializer: starting value to initialize the weight;
        None by default.

    Returns:
      A 2D Tensor with shape [batch x output_size] equal to
      sum_i(args[i] * W[i]), where W[i]s are newly created matrices.

    Raises:
      ValueError: if some of the arguments has unspecified or wrong shape.
    """
    is_sequence = nest.is_sequence(args)
    if args is None or (is_sequence and not args):
        raise ValueError("`args` must be specified")
    if not is_sequence:
        args = [args]

    # Calculate the total size of arguments on dimension 1.
    shapes = [a.get_shape() for a in args]
    total_arg_size = 0
    for shape in shapes:
        if shape.ndims != 2:
            raise ValueError("linear is expecting 2D arguments: %s" % shapes)
        width = shape[1].value
        if width is None:
            raise ValueError("linear expects shape[1] to be provided for shape %s, "
                             "but saw %s" % (shape, shape[1]))
        total_arg_size += width

    # All variables are created with the dtype of the first argument.
    dtype = args[0].dtype

    # Now the computation.
    scope = vs.get_variable_scope()
    with vs.variable_scope(scope) as outer_scope:
        weights = vs.get_variable(
            _WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size],
            dtype=dtype,
            initializer=kernel_initializer)
        # A single argument needs no concatenation.
        stacked = args[0] if len(args) == 1 else array_ops.concat(args, 1)
        res = math_ops.matmul(stacked, weights)
        if not bias:
            return res
        with vs.variable_scope(outer_scope) as inner_scope:
            # The bias variable is created with the partitioner cleared.
            inner_scope.set_partitioner(None)
            if bias_initializer is None:
                bias_initializer = init_ops.constant_initializer(0.0, dtype=dtype)
            biases = vs.get_variable(
                _BIAS_VARIABLE_NAME, [output_size],
                dtype=dtype,
                initializer=bias_initializer)
        return nn_ops.bias_add(res, biases)
def _linear(self, args, copy):
    """Projects `args` to `copy * self._num_units` columns.

    A "kernel" variable is always created; a "bias" variable is created and
    added only when `self._layer_norm` is falsy.
    """
    out_size = copy * self._num_units
    proj_size = args.get_shape()[-1]
    kernel = vs.get_variable("kernel", [proj_size, out_size])
    projected = math_ops.matmul(args, kernel)
    if self._layer_norm:
        return projected
    bias = vs.get_variable("bias", [out_size])
    return nn_ops.bias_add(projected, bias)
def _SetupValuesForDevice(self, tensor_in_sizes, filter_in_sizes, bias,
                          strides, padding, activation_mode, data_format,
                          dtype):
    """Builds the fused conv2d/bias/activation op and an unfused reference.

    The reference is conv2d followed by bias_add and relu; relu is applied to
    the reference regardless of `activation_mode`.

    Args:
      tensor_in_sizes: Input tensor dimensions in
        [batch, input_rows, input_cols, input_depth].
      filter_in_sizes: Filter tensor dimensions in
        [kernel_rows, kernel_cols, input_depth, output_depth].
      bias: 1-D bias tensor of length output_depth.
      strides: Stride: [col_stride, row_stride]
      padding: Padding type.
      activation_mode: Activation mode.
      data_format: Format of the data tensors.
      dtype: Data type for inputs and outputs.

    Returns:
      Symbolic tensor value and reference value that can be used to
      execute the computation and verify the results.
    """
    num_input_values = np.prod(tensor_in_sizes)
    num_filter_values = np.prod(filter_in_sizes)
    output_depth = filter_in_sizes[-1]  # the bias has one entry per output channel

    # Input and filter are filled with incrementing numbers starting from 1.
    input_values = [float(v) for v in range(1, num_input_values + 1)]
    filter_values = [float(v) for v in range(1, num_filter_values + 1)]

    with self.test_session(use_gpu=True):
        t1 = constant_op.constant(
            input_values, shape=tensor_in_sizes, dtype=dtype)
        t2 = constant_op.constant(
            filter_values, shape=filter_in_sizes, dtype=dtype)
        # The bias is supplied by the caller; the original note says it is
        # meant to leave negative values after bias add so relu is exercised.
        t3 = constant_op.constant(bias, shape=[output_depth], dtype=dtype)

        full_strides = [1] + strides + [1]
        is_nchw = data_format == "NCHW"
        if is_nchw:
            t1 = test_util.NHWCToNCHW(t1)
            full_strides = test_util.NHWCToNCHW(full_strides)

        output = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
            t1,
            t2,
            t3,
            strides=full_strides,
            padding=padding,
            data_format=data_format,
            activation_mode=activation_mode)

        ref_conv_output = nn_ops.conv2d(
            t1, t2, strides=full_strides, padding=padding,
            data_format=data_format)
        ref_bias_output = nn_ops.bias_add(
            ref_conv_output, t3, data_format=data_format)
        ref_output = nn_ops.relu(ref_bias_output)

        if is_nchw:
            # Hand both results back in NHWC order.
            output = test_util.NCHWToNHWC(output)
            ref_output = test_util.NCHWToNHWC(ref_output)

        return output, ref_output
def call(self, inputs, state):
    """Gated recurrent unit (GRU) with nunits cells."""
    # Both gates come from one matmul and are split apart after the sigmoid.
    gate_pre = math_ops.matmul(
        array_ops.concat([inputs, state], 1), self._gate_kernel)
    gate_pre = nn_ops.bias_add(gate_pre, self._gate_bias)
    gates = math_ops.sigmoid(gate_pre)
    r, u = array_ops.split(value=gates, num_or_size_splits=2, axis=1)

    # The candidate is computed from the inputs and the r-scaled state.
    r_state = r * state
    candidate_pre = math_ops.matmul(
        array_ops.concat([inputs, r_state], 1), self._candidate_kernel)
    candidate_pre = nn_ops.bias_add(candidate_pre, self._candidate_bias)
    c = self._activation(candidate_pre)

    # u blends the previous state with the candidate.
    new_h = u * state + (1 - u) * c
    return new_h, new_h
def build_conv_bias_relu_graph(device, input_shape, filter_shape, strides,
                               padding, num_iters, data_format):
    """Builds a graph containing a chain of conv2d + bias_add + relu stages.

    The first stage is always built; each later stage is created under a
    control dependency on the previous stage's relu output.

    Args:
      device: String, the device to run on.
      input_shape: Shape of the input tensor.
      filter_shape: Shape of the filter tensor.
      strides: A list of ints. 1-D of length 4. The stride of sliding
        window for each dimension of input.
      padding: A string from: "SAME", "VALID". The type of padding
        algorithm to use.
      num_iters: number of iterations to run conv2d.
      data_format: data format string of input, 'NHWC' and 'NCHW' are
        supported.

    Returns:
      An array of tensors to run()
    """
    if data_format == "NCHW":
        # Reorder the given shape from (0, 1, 2, 3) to (0, 3, 1, 2).
        input_shape = [input_shape[axis] for axis in (0, 3, 1, 2)]

    with ops.device("/%s:0" % device):
        inp = variables.Variable(random_ops.truncated_normal(input_shape))
        filt = variables.Variable(random_ops.truncated_normal(filter_shape))
        bias = variables.Variable(
            random_ops.truncated_normal([filter_shape[-1]]))

        def conv_bias_relu():
            """Adds one conv2d -> bias_add -> relu stage and returns its output."""
            conv = nn_ops.conv2d(
                inp, filt, strides, padding, data_format=data_format)
            biased = nn_ops.bias_add(conv, bias, data_format=data_format)
            return nn_ops.relu(biased)

        outputs = [conv_bias_relu()]
        for _ in range(1, num_iters):
            # Chain each stage on the previous one.
            with ops.control_dependencies([outputs[-1]]):
                outputs.append(conv_bias_relu())
        return control_flow_ops.group(*outputs)
def __call__(self, args):
    """Applies the stored weights (and bias, if built) to `args`.

    `args` is wrapped in a list when `self._is_sequence` is falsy; multiple
    arguments are concatenated along axis 1 before the matmul.
    """
    if not self._is_sequence:
        args = [args]
    stacked = args[0] if len(args) == 1 else array_ops.concat(args, 1)
    res = math_ops.matmul(stacked, self._weights)
    if not self._build_bias:
        return res
    return nn_ops.bias_add(res, self._biases)
def testGradients(self):
    """Checks the gradient op produced for `w` in bias_add(matmul(inp, w), b).

    The gradient of `h` with respect to `w` is expected to be a "MatMul" op
    that records `xw.op` as its original op, with `transpose_a` set and
    `transpose_b` unset.
    """
    with ops.Graph().as_default():
        inp = constant(1.0, shape=[32, 100], name="in")
        w = constant(1.0, shape=[100, 10], name="w")
        b = constant(1.0, shape=[10], name="b")
        xw = math_ops.matmul(inp, w, name="xw")
        h = bias_add(xw, b, name="h")
        w_grad = gradients.gradients(h, w)[0]
    # assertEqual replaces the deprecated assertEquals alias, which no longer
    # exists on unittest.TestCase in Python 3.12+.
    self.assertEqual("MatMul", w_grad.op.type)
    self.assertEqual(w_grad.op._original_op, xw.op)
    self.assertTrue(w_grad.op.get_attr("transpose_a"))
    self.assertFalse(w_grad.op.get_attr("transpose_b"))
def __call__(self, args):
    """Applies the stored weights (and bias, if built) to `args`.

    `args` is wrapped in a list when `self._is_sequence` is falsy; multiple
    arguments are concatenated along axis 1 before the matmul.
    """
    if not self._is_sequence:
        args = [args]
    if len(args) == 1:
        stacked = args[0]
    else:
        # Explicitly creating a one for a minor performance improvement.
        one = constant_op.constant(1, dtype=dtypes.int32)
        stacked = array_ops.concat(args, one)
    res = math_ops.matmul(stacked, self._weights)
    if not self._build_bias:
        return res
    return nn_ops.bias_add(res, self._biases)
def call(self, inputs, state):
    """Long short-term memory cell (LSTM) with masks for pruning.

    Args:
      inputs: `2-D` tensor with shape `[batch_size, input_size]`.
      state: An `LSTMStateTuple` of state tensors, each shaped
        `[batch_size, self.state_size]`, if `state_is_tuple` has been set to
        `True`. Otherwise, a `Tensor` shaped
        `[batch_size, 2 * self.state_size]`.

    Returns:
      A pair containing the new hidden state, and the new state (either a
        `LSTMStateTuple` or a concatenated state, depending on
        `state_is_tuple`).
    """
    # Note that using `add` and `multiply` instead of `+` and `*` gives a
    # performance improvement. So using those at the cost of readability.
    add = math_ops.add
    multiply = math_ops.multiply
    sigmoid = math_ops.sigmoid
    one = constant_op.constant(1, dtype=dtypes.int32)

    if self._state_is_tuple:
        c, h = state
    else:
        c, h = array_ops.split(value=state, num_or_size_splits=2, axis=one)

    # Parameters of gates are concatenated into one multiply for efficiency;
    # the kernel used here is `self._masked_kernel`.
    gate_inputs = nn_ops.bias_add(
        math_ops.matmul(
            array_ops.concat([inputs, h], 1), self._masked_kernel),
        self._bias)

    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    i, j, f, o = array_ops.split(
        value=gate_inputs, num_or_size_splits=4, axis=one)

    forget_bias_tensor = constant_op.constant(self._forget_bias, dtype=f.dtype)
    kept_memory = multiply(c, sigmoid(add(f, forget_bias_tensor)))
    written_memory = multiply(sigmoid(i), self._activation(j))
    new_c = add(kept_memory, written_memory)
    new_h = multiply(self._activation(new_c), sigmoid(o))

    if self._state_is_tuple:
        return new_h, tf_rnn.LSTMStateTuple(new_c, new_h)
    return new_h, array_ops.concat([new_c, new_h], 1)
def xw_plus_b(x, weights, biases, name=None):
    """Computes matmul(x, weights) + biases.

    Args:
      x: a 2D tensor. Dimensions typically: batch, in_units
      weights: a 2D tensor. Dimensions typically: in_units, out_units
      biases: a 1D tensor. Dimensions: out_units
      name: A name for the operation (optional). If not specified
        "xw_plus_b" is used.

    Returns:
      A 2-D Tensor computing matmul(x, weights) + biases.
      Dimensions typically: batch, out_units.
    """
    with ops.op_scope([x, weights, biases], name, "xw_plus_b") as name:
        x_tensor = ops.convert_to_tensor(x, name="x")
        weights_tensor = ops.convert_to_tensor(weights, name="weights")
        biases_tensor = ops.convert_to_tensor(biases, name="biases")
        product = math_ops.matmul(x_tensor, weights_tensor)
        # The scope name is given to the final op.
        return nn_ops.bias_add(product, biases_tensor, name=name)
def relu_layer(x, weights, biases, name=None):
    """Computes Relu(x * weight + biases).

    Args:
      x: a 2D tensor. Dimensions typically: batch, in_units
      weights: a 2D tensor. Dimensions typically: in_units, out_units
      biases: a 1D tensor. Dimensions: out_units
      name: A name for the operation (optional). If not specified
        "relu_layer" is used.

    Returns:
      A 2-D Tensor computing relu(matmul(x, weights) + biases).
      Dimensions typically: batch, out_units.
    """
    with ops.op_scope([x, weights, biases], name, "relu_layer") as name:
        x_tensor = ops.convert_to_tensor(x, name="x")
        weights_tensor = ops.convert_to_tensor(weights, name="weights")
        biases_tensor = ops.convert_to_tensor(biases, name="biases")
        product = math_ops.matmul(x_tensor, weights_tensor)
        pre_activation = nn_ops.bias_add(product, biases_tensor)
        # The scope name is given to the final relu op.
        return nn_ops.relu(pre_activation, name=name)
def __call__(self, inputs, state, scope=None):
    """Runs one step of the cell and returns `(m, new_state)`.

    The gate matrix is split three ways (i, j, o). The new cell state is the
    previous cell state plus sigmoid(i) * activation(j); no forget gate is
    applied.

    Args:
      inputs: Rank-2 input tensor; its second dimension must be statically
        known.
      state: A `(c, m)` pair when `self._state_is_tuple` is set, otherwise a
        single tensor that is sliced into `c` and `m` along axis 1.
      scope: Optional variable scope; defaults to the class name.

    Returns:
      A pair `(m, new_state)` where `new_state` is an `LSTMStateTuple` or the
      concatenation of `c` and `m`, depending on `self._state_is_tuple`.

    Raises:
      ValueError: if the second dimension of `inputs` is not statically known.
    """
    num_proj = self._num_proj if self._num_proj is not None else self._num_units

    if self._state_is_tuple:
        (c_prev, m_prev) = state
    else:
        c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
        m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])

    dtype = inputs.dtype
    input_size = inputs.get_shape().with_rank(2)[1]
    if input_size.value is None:
        raise ValueError("Could not infer input size from inputs.get_shape()[-1]")

    with vs.variable_scope(scope or type(self).__name__,
                           initializer=self._initializer):
        gate_width = 3 * self._num_units
        concat_w = tf.nn.rnn_cell._get_concat_variable(
            "W", [input_size.value + num_proj, gate_width], dtype,
            self._num_unit_shards)
        b = vs.get_variable(
            "B",
            shape=[gate_width],
            initializer=init_ops.zeros_initializer,
            dtype=dtype)

        cell_inputs = array_ops.concat(1, [inputs, m_prev])
        ltm_matrix = nn_ops.bias_add(math_ops.matmul(cell_inputs, concat_w), b)
        i, j, o = array_ops.split(1, 3, ltm_matrix)

        c = c_prev + sigmoid(i) * self._activation(j)
        if self._cell_clip is not None:
            c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)

        m = sigmoid(o) * self._activation(c)

        if self._num_proj is not None:
            concat_w_proj = tf.nn.rnn_cell._get_concat_variable(
                "W_P", [self._num_units, self._num_proj], dtype,
                self._num_proj_shards)
            m = math_ops.matmul(m, concat_w_proj)
            if self._proj_clip is not None:
                m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)

    if self._state_is_tuple:
        new_state = tf.nn.rnn_cell.LSTMStateTuple(c, m)
    else:
        new_state = array_ops.concat(1, [c, m])
    return m, new_state
def call(self, inputs, state):
    """Most basic RNN: output = new_state = act(W * input + U * state + B)."""
    wrapper = self._tflite_wrapper
    # Register the inputs and the hidden state with the TFLite wrapper.
    inputs = wrapper.add_input(
        inputs, tag="input", name="input", aggregate="stack", index_override=0)
    state = wrapper.add_input(
        state,
        tag="hidden_state",
        name="hidden_state",
        aggregate="first",
        index_override=4)

    # Input and recurrent weights are joined along axis 1 and transposed
    # before being used as the right-hand operand of the matmul.
    joined_weights = array_ops.concat(
        [self._input_weights, self._recurrent_weights], 1)
    weights = array_ops.transpose(joined_weights)

    pre_activation = math_ops.matmul(
        array_ops.concat([inputs, state], 1), weights)
    pre_activation = nn_ops.bias_add(pre_activation, self._bias)
    activated = self._activation(pre_activation)

    output = wrapper.add_output(
        activated, tag="output", name="output", index_override=1,
        aggregate="stack")
    return output, output
def _SimulateFusedConv2dBiasActivationInt8(conv_input_scale, conv_input, kernel,
                                           padding, strides, side_input_scale,
                                           side_input, biases, apply_relu):
    """Simulates the int8 fused 2-D convolution op using separate float ops.

    The arguments and return values have the same format, meanings and
    restrictions as the actual op.

    Args:
      conv_input_scale: A scalar 'float'.
      conv_input: A `Tensor` of type `qint8` in NCHW_VECT_C layout.
      kernel: A `Tensor` of type `qint8` in OIHW_VECT_I layout.
      padding: A `string` from: `"SAME", "VALID"`.
      strides: A list of `ints`.
      side_input_scale: A scalar 'float'.
      side_input: A `Tensor` of type `qint8` in NCHW_VECT_C layout.
      biases: A `Tensor` of type `float32` in NCHW layout.
      apply_relu: A boolean to specify whether to apply "Relu" activation
        function that clips outputs to the range [0, 127], or "None"
        activation that clips to the range [-128, 127].

    Returns:
      A `Tensor` of type `qint8` in NCHW_VECT_C layout.
    """
    # Dequantize the operands and bring them into the layouts conv2d is given.
    float_input = _NchwVectCToNchw(
        gen_array_ops.dequantize(conv_input, -128, 127))
    float_kernel = _OihwVectIToHwio(gen_array_ops.dequantize(kernel, -128, 127))
    conv_result = nn_ops.conv2d(
        float_input,
        float_kernel,
        strides=strides,
        padding=padding,
        data_format="NCHW") * conv_input_scale

    float_side_input = _NchwVectCToNchw(
        gen_array_ops.dequantize(side_input, -128, 127))
    conv_and_side_inputs = conv_result + side_input_scale * float_side_input

    biased = nn_ops.bias_add(conv_and_side_inputs, biases, data_format="NCHW")
    activated = nn_ops.relu(biased) if apply_relu else biased

    # quantize_v2 yields three values; only the first one is returned.
    quantized, _, _ = gen_array_ops.quantize_v2(
        _NchwToNchwVectC(activated), -128, 127, dtypes.qint8)
    return quantized
def _test_fully_connected(tensor_in_sizes, filter_in_sizes, bias_in_size=None):
    """Run one fully connected comparison between TFLite and TVM.

    The input and filter are filled with incrementing floats starting at 1.
    The input is reshaped to [N, -1] and multiplied by the filter; when
    `bias_in_size` is given, a bias is added as well.
    """
    num_input_values = 1
    for dim in tensor_in_sizes:
        num_input_values *= dim
    num_filter_values = 1
    for dim in filter_in_sizes:
        num_filter_values *= dim

    # Initializes the input tensor with array containing incrementing
    # numbers from 1.
    data_array = [float(v) for v in range(1, num_input_values + 1)]
    filter_array = [float(v) for v in range(1, num_filter_values + 1)]

    batch = tensor_in_sizes[0]
    assert int(num_input_values / batch) == filter_in_sizes[0], \
        "input size and filter size are mismatched"

    with tf.Graph().as_default():
        in_data = array_ops.placeholder(shape=tensor_in_sizes, dtype='float32')
        in_filter = constant_op.constant(
            filter_array, shape=filter_in_sizes, dtype='float32')

        # reshape N H W C into N H*W*C
        flattened = array_ops.reshape(in_data, [batch, -1])
        out = math_ops.mat_mul(flattened, in_filter)

        # if we have bias
        if bias_in_size:
            bias_len = bias_in_size[0]
            assert bias_len == filter_in_sizes[1], "bias and filter size are mismatched"
            bias_array = [float(v) for v in range(1, bias_len + 1)]
            in_bias = constant_op.constant(
                bias_array, shape=bias_in_size, dtype='float32')
            out = nn_ops.bias_add(out, in_bias)

        tflite_data_array = np.reshape(data_array, tensor_in_sizes).astype('float32')
        # The TVM input is the same data with axes permuted to (0, 3, 1, 2).
        tvm_data_array = np.transpose(tflite_data_array, axes=(0, 3, 1, 2))
        compare_tflite_with_tvm(
            tflite_data_array, tvm_data_array, 'Placeholder:0', [in_data], [out])
def call(self, inputs, state):
    """Run one time step of the IndRNN.

    Calculates the output and new hidden state using the IndRNN equation

    `output = new_state = act(W * input + u (*) state + b)`

    where `*` is the matrix multiplication and `(*)` is the Hadamard product.

    Args:
      inputs: Tensor, 2-D tensor of shape `[batch, num_units]`.
      state: Tensor, 2-D tensor of shape `[batch, num_units]` containing the
        previous hidden state.

    Returns:
      A tuple containing the output and new hidden state. Both are the same
      2-D tensor of shape `[batch, num_units]`.
    """
    input_term = math_ops.matmul(inputs, self._input_kernel)
    # The recurrent term is an element-wise product, not a matmul.
    recurrent_term = math_ops.multiply(state, self._recurrent_kernel)
    pre_activation = nn_ops.bias_add(
        math_ops.add(input_term, recurrent_term), self._bias)
    new_state = self._activation(pre_activation)
    return new_state, new_state
def call(self, inputs, state):
    """Run one step of LSTM.

    Args:
      inputs: input Tensor, 2D, `[batch, num_units]`.
      state: if `state_is_tuple` is False, this must be a state Tensor,
        `2-D, [batch, state_size]`. If `state_is_tuple` is True, this must be
        a tuple of state Tensors, both `2-D`, with column sizes `c_state` and
        `m_state`.

    Returns:
      A tuple containing:

      - A `2-D, [batch, output_dim]`, Tensor representing the output of the
        LSTM after reading `inputs` when previous state was `state`.
        Here output_dim is:
           num_proj if num_proj was set,
           num_units otherwise.
      - Tensor(s) representing the new state of LSTM after reading `inputs`
        when the previous state was `state`. Same type and shape(s) as
        `state`.

    Raises:
      ValueError: If input size cannot be inferred from inputs via
        static shape inference.
    """
    sigmoid = math_ops.sigmoid
    num_proj = self._num_proj if self._num_proj is not None else self._num_units

    if self._state_is_tuple:
        (c_prev, m_prev) = state
    else:
        c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
        m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])

    input_size = inputs.get_shape().with_rank(2)[1]
    if input_size.value is None:
        raise ValueError("Could not infer input size from inputs.get_shape()[-1]")

    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    lstm_matrix = nn_ops.bias_add(
        math_ops.matmul(array_ops.concat([inputs, m_prev], 1), self._kernel),
        self._bias)
    i, j, f, o = array_ops.split(
        value=lstm_matrix, num_or_size_splits=4, axis=1)

    # Diagonal (peephole) connections add a c-dependent term to each gate.
    peepholes = self._use_peepholes
    if peepholes:
        forget_gate = sigmoid(f + self._forget_bias + self._w_f_diag * c_prev)
    else:
        forget_gate = sigmoid(f + self._forget_bias)
    retained = forget_gate * c_prev
    if peepholes:
        input_gate = sigmoid(i + self._w_i_diag * c_prev)
    else:
        input_gate = sigmoid(i)
    c = retained + input_gate * self._activation(j)

    if self._cell_clip is not None:
        # pylint: disable=invalid-unary-operand-type
        c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
        # pylint: enable=invalid-unary-operand-type

    if peepholes:
        output_gate = sigmoid(o + self._w_o_diag * c)
    else:
        output_gate = sigmoid(o)
    m = output_gate * self._activation(c)

    if self._num_proj is not None:
        m = math_ops.matmul(m, self._proj_kernel)
        if self._proj_clip is not None:
            # pylint: disable=invalid-unary-operand-type
            m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
            # pylint: enable=invalid-unary-operand-type

    if self._state_is_tuple:
        new_state = LSTMStateTuple(c, m)
    else:
        new_state = array_ops.concat([c, m], 1)
    return m, new_state
def call(self, inputs, state, training=False):
    """Runs one LSTM step with optional normalization layers.

    The gate pre-activations are passed through `self._bn` (or `self._bn_i`
    and `self._bn_m` when inputs and hidden state are normalized separately)
    and the cell state through `self._bn_c`, depending on the instance's
    `_normalize_*` flags. These layers are presumably batch normalization;
    confirm against the class constructor.

    Args:
      inputs: Rank-2 input tensor; its second dimension must be statically
        known.
      state: A `(c, m)` pair when `self._state_is_tuple` is set, otherwise a
        single tensor that is sliced into `c` and `m` along axis 1.
      training: Forwarded to every normalization layer call.

    Returns:
      A pair `(m, new_state)`; `new_state` holds the un-normalized cell state
      `c` together with `m`.

    Raises:
      ValueError: if the second dimension of `inputs` is not statically known.
    """
    sigmoid = math_ops.sigmoid
    num_proj = self._num_proj if self._num_proj is not None else self._num_units

    if self._state_is_tuple:
        (c_prev, m_prev) = state
    else:
        c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
        m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])

    input_size = inputs.get_shape().with_rank(2)[1]
    if input_size.value is None:
        raise ValueError(
            "Could not infer input size from inputs.get_shape()[-1]")

    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    separate_norm = (self._normalize_in_to_hidden
                     and not self._normalize_in_together)
    if separate_norm:
        # Inputs and hidden state are projected and normalized separately.
        op_i = math_ops.matmul(inputs, self._kernel_i)
        op_m = math_ops.matmul(m_prev, self._kernel_m)
        lstm_matrix = self._bn_i(op_i, training=training)
        lstm_matrix = lstm_matrix + self._bn_m(op_m, training=training)
    else:
        lstm_matrix = math_ops.matmul(
            array_ops.concat([inputs, m_prev], 1), self._kernel)
        if self._normalize_in_to_hidden:
            lstm_matrix = self._bn(lstm_matrix, training=training)

    lstm_matrix = nn_ops.bias_add(lstm_matrix, self._bias)
    i, j, f, o = array_ops.split(
        value=lstm_matrix, num_or_size_splits=4, axis=1)

    peepholes = self._use_peepholes
    if peepholes:
        forget_gate = sigmoid(f + self._forget_bias + self._w_f_diag * c_prev)
    else:
        forget_gate = sigmoid(f + self._forget_bias)
    retained = forget_gate * c_prev
    if peepholes:
        input_gate = sigmoid(i + self._w_i_diag * c_prev)
    else:
        input_gate = sigmoid(i)
    c = retained + input_gate * self._activation(j)

    if self._cell_clip is not None:
        c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)

    # Only the value used for the output is normalized; `c` itself is what
    # goes into the returned state.
    c_new = self._bn_c(c, training=training) if self._normalize_cell else c

    if peepholes:
        output_gate = sigmoid(o + self._w_o_diag * c_new)
    else:
        output_gate = sigmoid(o)
    m = output_gate * self._activation(c_new)

    if self._num_proj is not None:
        m = math_ops.matmul(m, self._proj_kernel)
        if self._proj_clip is not None:
            m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)

    if self._state_is_tuple:
        new_state = LSTMStateTuple(c, m)
    else:
        new_state = array_ops.concat([c, m], 1)
    return m, new_state
def call(self, inputs, state):
    """Long short-term memory cell (Neat).

    The kernel and bias are split along their output dimension into a
    `5 * num_units` part (yielding s, t, u, v, w) and a `3 * num_units` part
    (yielding x, y, z). The first part combines the hidden-state and input
    projections with an addition, the second with an element-wise maximum.

    Args:
      inputs: `2-D` tensor with shape `[batch_size, input_size]`.
      state: An `LSTMStateTuple` of state tensors, each shaped
        `[batch_size, num_units]`, if `state_is_tuple` has been set to
        `True`. Otherwise, a `Tensor` shaped
        `[batch_size, 2 * num_units]`.

    Returns:
      A pair containing the new hidden state, and the new state (either a
        `LSTMStateTuple` or a concatenated state, depending on
        `state_is_tuple`).
    """
    sigmoid = math_ops.sigmoid
    zero = constant_op.constant(0, dtype=dtypes.int32)
    one = constant_op.constant(1, dtype=dtypes.int32)

    # Parameters of gates are concatenated into one multiply for efficiency.
    if self._state_is_tuple:
        c, h = state
    else:
        c, h = array_ops.split(value=state, num_or_size_splits=2, axis=one,
                               name="c_h_-_split")

    input_depth = int(inputs.get_shape()[1])
    # Column split of kernel/bias: 5 * num_units for (s, t, u, v, w) and
    # 3 * num_units for (x, y, z).
    ratio = [self._num_units * 5, self._num_units * 3]
    row_split = [input_depth, self._num_units]

    W_f, W_r = array_ops.split(value=self._kernel, num_or_size_splits=ratio,
                               axis=one, name="W-f_W-r_-_split_-kernel")
    # Rows are split into the input part and the hidden-state part.
    W_fi, W_fh = array_ops.split(
        value=W_f, num_or_size_splits=row_split, axis=zero,
        name="W-fi_W-fh_-_split_W-f")
    b_f, b_r = array_ops.split(value=self._bias, num_or_size_splits=ratio,
                               axis=zero, name="b-f_b-r_-_split_-bias")

    # First group: hidden and input projections are added.
    sw = math_ops.add(math_ops.matmul(h, W_fh), math_ops.matmul(inputs, W_fi))
    sw = nn_ops.bias_add(value=sw, bias=b_f)
    s, t, u, v, w = array_ops.split(value=sw, num_or_size_splits=5, axis=one,
                                    name="s_t_v_u_w_-_split_sw")

    W_ri, W_rh = array_ops.split(
        value=W_r, num_or_size_splits=row_split, axis=zero,
        name="W-ri_W-rh_-_split_W-r")
    # Second group: hidden and input projections are combined with an
    # element-wise maximum rather than a sum.
    xz = gen_math_ops.maximum(math_ops.matmul(h, W_rh),
                              math_ops.matmul(inputs, W_ri))
    xz = nn_ops.bias_add(xz, b_r)
    x, y, z = array_ops.split(value=xz, num_or_size_splits=3, axis=one,
                              name="x_y_z_-_split_xz")

    add = math_ops.add
    multiply = math_ops.multiply
    tanh = math_ops.tanh
    relu = nn_ops.relu
    identity = array_ops.identity

    # Nas cell 2
    new_c = multiply(identity(add(identity(add(c, tanh(z))), identity(y))),
                     sigmoid(add(relu(v), tanh(s))))
    new_h = tanh(
        multiply(
            identity(new_c),
            sigmoid(
                multiply(sigmoid(add(tanh(x), tanh(w))),
                         sigmoid(add(identity(u), tanh(t)))))))

    if self._state_is_tuple:
        new_state = LSTMStateTuple(new_c, new_h)
    else:
        new_state = array_ops.concat([new_c, new_h], 1)
    return new_h, new_state
def __call__(self, inputs, state, scope=None):
    """Run one step of LSTM.

    Args:
      inputs: input Tensor, 2D, batch x num_units.
      state: state Tensor, 2D, batch x state_size.
      scope: VariableScope for the created subgraph; defaults to "LSTMCell".

    Returns:
      A tuple containing:
      - A 2D, batch x output_dim, Tensor representing the output of the LSTM
        after reading "inputs" when previous state was "state".
        Here output_dim is:
           num_proj if num_proj was set,
           num_units otherwise.
      - A 2D, batch x state_size, Tensor representing the new state of LSTM
        after reading "inputs" when previous state was "state".

    Raises:
      ValueError: if an input_size was specified and the provided inputs have
        a different dimension.
    """
    num_proj = self._num_units if self._num_proj is None else self._num_proj

    # The state holds c in its first num_units columns, followed by m.
    c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
    m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])

    dtype = inputs.dtype
    actual_input_size = inputs.get_shape().as_list()[1]
    if self._input_size and self._input_size != actual_input_size:
        # Both values must be passed to `%` as one tuple; formatting with a
        # single operand raised TypeError instead of the documented
        # ValueError. `%s` also keeps the message printable when the static
        # size is unknown (None).
        raise ValueError(
            "Actual input size not same as specified: %s vs %s."
            % (actual_input_size, self._input_size))

    with vs.variable_scope(scope or type(self).__name__,
                           initializer=self._initializer):  # "LSTMCell"
        concat_w = _get_concat_variable(
            "W", [actual_input_size + num_proj, 4 * self._num_units],
            dtype, self._num_unit_shards)

        b = vs.get_variable(
            "B", shape=[4 * self._num_units],
            initializer=array_ops.zeros_initializer, dtype=dtype)

        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        cell_inputs = array_ops.concat(1, [inputs, m_prev])
        lstm_matrix = nn_ops.bias_add(
            math_ops.matmul(cell_inputs, concat_w), b)
        i, j, f, o = array_ops.split(1, 4, lstm_matrix)

        # Diagonal connections
        if self._use_peepholes:
            w_f_diag = vs.get_variable(
                "W_F_diag", shape=[self._num_units], dtype=dtype)
            w_i_diag = vs.get_variable(
                "W_I_diag", shape=[self._num_units], dtype=dtype)
            w_o_diag = vs.get_variable(
                "W_O_diag", shape=[self._num_units], dtype=dtype)

        # A constant 1 is added to the forget gate pre-activation.
        if self._use_peepholes:
            c = (sigmoid(f + 1 + w_f_diag * c_prev) * c_prev +
                 sigmoid(i + w_i_diag * c_prev) * tanh(j))
        else:
            c = (sigmoid(f + 1) * c_prev + sigmoid(i) * tanh(j))

        if self._cell_clip is not None:
            c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)

        if self._use_peepholes:
            m = sigmoid(o + w_o_diag * c) * tanh(c)
        else:
            m = sigmoid(o) * tanh(c)

        if self._num_proj is not None:
            concat_w_proj = _get_concat_variable(
                "W_P", [self._num_units, self._num_proj],
                dtype, self._num_proj_shards)
            m = math_ops.matmul(m, concat_w_proj)

    return m, array_ops.concat(1, [c, m])
def __call__(self, inputs, state, scope=None):
  """Run one step of the grid LSTM over all frequency blocks.

  For every frequency block produced by `self._make_tf_features(inputs)` a
  frequency LSTM (F-LSTM) and a time LSTM (T-LSTM) step are computed. The
  F-LSTM state is carried from one frequency block to the next (starting from
  zeros), while the T-LSTM state for each block is read from `state`.

  Args:
    inputs: input Tensor, 2D, batch x num_units. The batch dimension is
      converted with `int()`, so it presumably must be statically known.
    state: state Tensor, 2D, batch x state_size. For frequency block `k` the
      columns `[2k * num_units, (2k + 1) * num_units)` are read as the
      previous time cell state and the next `num_units` columns as the
      previous time output.
    scope: VariableScope for the created subgraph; defaults to the class
      name.

  Returns:
    A tuple containing:
    - A 2D Tensor with `[m_time, m_freq]` of every frequency block
      concatenated along axis 1.
    - A 2D Tensor with `[c_time, m_time]` of every frequency block
      concatenated along axis 1 (the new state).

  Raises:
    ValueError: may propagate from `self._make_tf_features`; this method
      performs no input-size validation itself (TODO confirm).
  """
  freq_inputs = self._make_tf_features(inputs)
  dtype = inputs.dtype
  actual_input_size = freq_inputs[0].get_shape().as_list()[1]
  with vs.variable_scope(scope or type(self).__name__,
                         initializer=self._initializer):  # "GridLSTMCell"
    concat_w_f = _get_concat_variable(
        "W_f", [actual_input_size + 2 * self._num_units,
                4 * self._num_units],
        dtype, self._num_unit_shards)
    b_f = vs.get_variable(
        "B_f", shape=[4 * self._num_units],
        initializer=array_ops.zeros_initializer, dtype=dtype)
    if not self._share_time_frequency_weights:
      concat_w_t = _get_concat_variable(
          "W_t", [actual_input_size + 2 * self._num_units,
                  4 * self._num_units],
          dtype, self._num_unit_shards)
      b_t = vs.get_variable(
          "B_t", shape=[4 * self._num_units],
          initializer=array_ops.zeros_initializer, dtype=dtype)

    if self._use_peepholes:
      # Diagonal connections
      w_f_diag_freqf = vs.get_variable(
          "W_F_diag_freqf", shape=[self._num_units], dtype=dtype)
      w_i_diag_freqf = vs.get_variable(
          "W_I_diag_freqf", shape=[self._num_units], dtype=dtype)
      w_o_diag_freqf = vs.get_variable(
          "W_O_diag_freqf", shape=[self._num_units], dtype=dtype)
      w_f_diag_freqt = vs.get_variable(
          "W_F_diag_freqt", shape=[self._num_units], dtype=dtype)
      w_i_diag_freqt = vs.get_variable(
          "W_I_diag_freqt", shape=[self._num_units], dtype=dtype)
      w_o_diag_freqt = vs.get_variable(
          "W_O_diag_freqt", shape=[self._num_units], dtype=dtype)
      if not self._share_time_frequency_weights:
        w_f_diag_timef = vs.get_variable(
            "W_F_diag_timef", shape=[self._num_units], dtype=dtype)
        w_i_diag_timef = vs.get_variable(
            "W_I_diag_timef", shape=[self._num_units], dtype=dtype)
        w_o_diag_timef = vs.get_variable(
            "W_O_diag_timef", shape=[self._num_units], dtype=dtype)
        w_f_diag_timet = vs.get_variable(
            "W_F_diag_timet", shape=[self._num_units], dtype=dtype)
        w_i_diag_timet = vs.get_variable(
            "W_I_diag_timet", shape=[self._num_units], dtype=dtype)
        w_o_diag_timet = vs.get_variable(
            "W_O_diag_timet", shape=[self._num_units], dtype=dtype)

    # initialize the first freq state to be zero
    m_prev_freq = array_ops.zeros(
        [int(inputs.get_shape()[0]), self._num_units], dtype)
    c_prev_freq = array_ops.zeros(
        [int(inputs.get_shape()[0]), self._num_units], dtype)

    # Per-block results are collected and concatenated once after the loop.
    state_out_parts = []
    m_out_parts = []
    for freq_index, freq_input in enumerate(freq_inputs):
      c_prev_time = array_ops.slice(
          state, [0, 2 * freq_index * self._num_units],
          [-1, self._num_units])
      m_prev_time = array_ops.slice(
          state, [0, (2 * freq_index + 1) * self._num_units],
          [-1, self._num_units])

      # i = input_gate, j = new_input, f = forget_gate, o = output_gate
      cell_inputs = array_ops.concat(
          1, [freq_input, m_prev_time, m_prev_freq])

      # F-LSTM
      lstm_matrix_freq = nn_ops.bias_add(
          math_ops.matmul(cell_inputs, concat_w_f), b_f)
      i_freq, j_freq, f_freq, o_freq = array_ops.split(
          1, 4, lstm_matrix_freq)
      # T-LSTM
      if self._share_time_frequency_weights:
        i_time = i_freq
        j_time = j_freq
        f_time = f_freq
        o_time = o_freq
      else:
        lstm_matrix_time = nn_ops.bias_add(
            math_ops.matmul(cell_inputs, concat_w_t), b_t)
        i_time, j_time, f_time, o_time = array_ops.split(
            1, 4, lstm_matrix_time)

      # F-LSTM c_freq
      if self._use_peepholes:
        c_freq = (sigmoid(f_freq + self._forget_bias +
                          w_f_diag_freqf * c_prev_freq +
                          w_f_diag_freqt * c_prev_time) * c_prev_freq +
                  sigmoid(i_freq + w_i_diag_freqf * c_prev_freq +
                          w_i_diag_freqt * c_prev_time) * tanh(j_freq))
      else:
        c_freq = (sigmoid(f_freq + self._forget_bias) * c_prev_freq +
                  sigmoid(i_freq) * tanh(j_freq))

      if self._cell_clip is not None:
        # pylint: disable=invalid-unary-operand-type
        c_freq = clip_ops.clip_by_value(c_freq, -self._cell_clip,
                                        self._cell_clip)
        # pylint: enable=invalid-unary-operand-type

      # T-LSTM c_time
      # Each peephole term is "freq weight * c_prev_freq + time weight *
      # c_prev_time", exactly as in the input gate; the forget gate must not
      # nest the time term inside the freq weight's product.
      if self._use_peepholes:
        if self._share_time_frequency_weights:
          c_time = (sigmoid(f_time + self._forget_bias +
                            w_f_diag_freqf * c_prev_freq +
                            w_f_diag_freqt * c_prev_time) * c_prev_time +
                    sigmoid(i_time + w_i_diag_freqf * c_prev_freq +
                            w_i_diag_freqt * c_prev_time) * tanh(j_time))
        else:
          c_time = (sigmoid(f_time + self._forget_bias +
                            w_f_diag_timef * c_prev_freq +
                            w_f_diag_timet * c_prev_time) * c_prev_time +
                    sigmoid(i_time + w_i_diag_timef * c_prev_freq +
                            w_i_diag_timet * c_prev_time) * tanh(j_time))
      else:
        c_time = (sigmoid(f_time + self._forget_bias) * c_prev_time +
                  sigmoid(i_time) * tanh(j_time))

      if self._cell_clip is not None:
        # Clip the time cell state itself; clipping c_freq here would
        # overwrite c_time with the frequency cell state.
        # pylint: disable=invalid-unary-operand-type
        c_time = clip_ops.clip_by_value(c_time, -self._cell_clip,
                                        self._cell_clip)
        # pylint: enable=invalid-unary-operand-type

      # F-LSTM m_freq
      if self._use_peepholes:
        m_freq = sigmoid(o_freq + w_o_diag_freqf * c_freq +
                         w_o_diag_freqt * c_time) * tanh(c_freq)
      else:
        m_freq = sigmoid(o_freq) * tanh(c_freq)

      # T-LSTM m_time
      if self._use_peepholes:
        if self._share_time_frequency_weights:
          m_time = sigmoid(o_time + w_o_diag_freqf * c_freq +
                           w_o_diag_freqt * c_time) * tanh(c_time)
        else:
          m_time = sigmoid(o_time + w_o_diag_timef * c_freq +
                           w_o_diag_timet * c_time) * tanh(c_time)
      else:
        m_time = sigmoid(o_time) * tanh(c_time)

      # The F-LSTM state feeds the next frequency block.
      m_prev_freq = m_freq
      c_prev_freq = c_freq

      # Collect the outputs for T-LSTM and F-LSTM for each shift
      state_out_parts.extend([c_time, m_time])
      m_out_parts.extend([m_time, m_freq])

    state_out = array_ops.concat(1, state_out_parts)
    m_out = array_ops.concat(1, m_out_parts)

  return m_out, state_out
def _recurrence(self, inputs, hidden_state, cell_states, depth):
  """Use recurrence to traverse the nested structure.

  Computes the gates for nesting level `depth`, then either combines them
  directly (innermost level, `depth == self.depth - 1`) or recurses one level
  deeper, passing the gated input as `inputs` and the gated previous cell
  state as `hidden_state`.

  Args:
    inputs: A 2D `Tensor` of [batch_size x input_size] shape.
    hidden_state: A 2D `Tensor` of [batch_size x num_units] shape.
    cell_states: A `list` of 2D `Tensor` of [batch_size x num_units] shape.
    depth: `int`
      the current depth in the nested structure, begins at 0.

  Returns:
    new_h: A 2D `Tensor` of [batch_size x num_units] shape.
      the latest hidden state for current step.
    new_cs: A `list` of 2D `Tensor` of [batch_size x num_units] shape.
      The accumulated states for current step; this level's `new_h` is
      prepended to the list returned by the deeper levels.
  """
  # NOTE(review): `sigmoid` is bound here but not used in this method.
  sigmoid = math_ops.sigmoid
  # Axis 1 as an int32 constant tensor, used as the split axis below.
  one = constant_op.constant(1, dtype=dtypes.int32)
  # Parameters of gates are concatenated into one multiply for efficiency.
  c = cell_states[depth]
  h = hidden_state

  gate_inputs = math_ops.matmul(
      array_ops.concat([inputs, h], 1), self._kernels[depth])
  if self._use_bias:
    gate_inputs = nn_ops.bias_add(gate_inputs, self._biases[depth])
  if self._use_peepholes:
    peep_gate_inputs = math_ops.matmul(c, self._peep_kernels[depth])
    i_peep, f_peep, o_peep = array_ops.split(
        value=peep_gate_inputs, num_or_size_splits=3, axis=one)

  # i = input_gate, j = new_input, f = forget_gate, o = output_gate
  i, j, f, o = array_ops.split(
      value=gate_inputs, num_or_size_splits=4, axis=one)
  if self._use_peepholes:
    i += i_peep
    f += f_peep
    o += o_peep

  # NOTE(review): this block recomputes the same peephole terms and adds them
  # a second time, so the peephole contribution to i/f/o is doubled. This
  # looks like an accidental duplication; confirm before removing, since
  # doing so changes the numerics of already-trained weights.
  if self._use_peepholes:
    peep_gate_inputs = math_ops.matmul(c, self._peep_kernels[depth])
    i_peep, f_peep, o_peep = array_ops.split(
        value=peep_gate_inputs, num_or_size_splits=3, axis=one)
    i += i_peep
    f += f_peep
    o += o_peep

  # Note that using `add` and `multiply` instead of `+` and `*` gives a
  # performance improvement. So using those at the cost of readability.
  add = math_ops.add
  multiply = math_ops.multiply

  # The forget bias is only applied when biases are enabled.
  if self._use_bias:
    forget_bias_tensor = constant_op.constant(
        self._forget_bias, dtype=f.dtype)
    f = add(f, forget_bias_tensor)

  # Previous cell state scaled by the forget gate.
  inner_hidden = multiply(c, self._gate_activation(f))

  # The outermost level uses `_cell_activation` for the candidate input;
  # nested levels use `_activation`.
  if depth == 0:
    inner_input = multiply(self._gate_activation(i),
                           self._cell_activation(j))
  else:
    inner_input = multiply(self._gate_activation(i), self._activation(j))

  if depth == (self.depth - 1):
    # Innermost level: combine retained state and new input directly.
    new_c = add(inner_hidden, inner_input)
    new_cs = [new_c]
  else:
    # Otherwise the cell state is produced by the next nested level.
    new_c, new_cs = self._recurrence(
        inputs=inner_input,
        hidden_state=inner_hidden,
        cell_states=cell_states,
        depth=depth + 1)
  new_h = multiply(self._activation(new_c), self._gate_activation(o))
  new_cs = [new_h] + new_cs
  return new_h, new_cs
def __call__(self, inputs, state, scope=None):
  """Run one step of LSTM.

  For every frequency block returned by `self._make_tf_features(inputs)` a
  frequency LSTM (F-LSTM) and a time LSTM (T-LSTM) step are computed. When
  `self._couple_input_forget_gates` is set only three gates are produced and
  each forget gate is `1 - input gate`.

  Args:
    inputs: input Tensor, 2D, batch x num_units.
    state: state Tensor, 2D, batch x state_size. When `self._state_is_tuple`
      is set, the per-block states are instead read from attributes named
      `state_fNN_c` / `state_fNN_m` of `state`.
    scope: VariableScope for the created subgraph; defaults to "LSTMCell".

  Returns:
    A tuple containing:
    - A 2D, batch x output_dim, Tensor representing the output of the LSTM
      after reading "inputs" when previous state was "state".
      It is the concatenation along axis 1 of `[m_time, m_freq]` for every
      frequency block.
    - The new state: `[c_time, m_time]` of every frequency block, packed
      with `self._state_tuple_type` when `self._state_is_tuple` is set and
      concatenated along axis 1 otherwise.

  Raises:
    ValueError: if an input_size was specified and the provided inputs have
      a different dimension. NOTE(review): no such check is visible in this
      method; presumably raised by a helper - confirm.
  """
  sigmoid = math_ops.sigmoid
  tanh = math_ops.tanh
  # Coupled input/forget gates need only i, j and o.
  num_gates = 3 if self._couple_input_forget_gates else 4
  freq_inputs = self._make_tf_features(inputs)
  dtype = inputs.dtype
  actual_input_size = freq_inputs[0].get_shape().as_list()[1]
  with vs.variable_scope(scope or type(self).__name__,
                         initializer=self._initializer):  # "GridLSTMCell"
    concat_w_f = _get_concat_variable(
        "W_f", [actual_input_size + 2 * self._num_units,
                num_gates * self._num_units],
        dtype, self._num_unit_shards)
    b_f = vs.get_variable(
        "B_f", shape=[num_gates * self._num_units],
        initializer=init_ops.zeros_initializer, dtype=dtype)
    # A separate weight matrix and bias for the T-LSTM are only created when
    # weights are not shared between time and frequency.
    if not self._share_time_frequency_weights:
      concat_w_t = _get_concat_variable(
          "W_t", [actual_input_size + 2 * self._num_units,
                  num_gates * self._num_units],
          dtype, self._num_unit_shards)
      b_t = vs.get_variable(
          "B_t", shape=[num_gates * self._num_units],
          initializer=init_ops.zeros_initializer, dtype=dtype)

    if self._use_peepholes:
      # Diagonal connections
      # Forget-gate peepholes are not needed when the forget gate is derived
      # from the input gate.
      if not self._couple_input_forget_gates:
        w_f_diag_freqf = vs.get_variable(
            "W_F_diag_freqf", shape=[self._num_units], dtype=dtype)
        w_f_diag_freqt = vs.get_variable(
            "W_F_diag_freqt", shape=[self._num_units], dtype=dtype)
      w_i_diag_freqf = vs.get_variable(
          "W_I_diag_freqf", shape=[self._num_units], dtype=dtype)
      w_i_diag_freqt = vs.get_variable(
          "W_I_diag_freqt", shape=[self._num_units], dtype=dtype)
      w_o_diag_freqf = vs.get_variable(
          "W_O_diag_freqf", shape=[self._num_units], dtype=dtype)
      w_o_diag_freqt = vs.get_variable(
          "W_O_diag_freqt", shape=[self._num_units], dtype=dtype)
      if not self._share_time_frequency_weights:
        if not self._couple_input_forget_gates:
          w_f_diag_timef = vs.get_variable(
              "W_F_diag_timef", shape=[self._num_units], dtype=dtype)
          w_f_diag_timet = vs.get_variable(
              "W_F_diag_timet", shape=[self._num_units], dtype=dtype)
        w_i_diag_timef = vs.get_variable(
            "W_I_diag_timef", shape=[self._num_units], dtype=dtype)
        w_i_diag_timet = vs.get_variable(
            "W_I_diag_timet", shape=[self._num_units], dtype=dtype)
        w_o_diag_timef = vs.get_variable(
            "W_O_diag_timef", shape=[self._num_units], dtype=dtype)
        w_o_diag_timet = vs.get_variable(
            "W_O_diag_timet", shape=[self._num_units], dtype=dtype)

    # initialize the first freq state to be zero
    # NOTE(review): int() on the batch dimension presumably requires it to be
    # statically known - confirm.
    m_prev_freq = array_ops.zeros(
        [int(inputs.get_shape()[0]), self._num_units], dtype)
    c_prev_freq = array_ops.zeros(
        [int(inputs.get_shape()[0]), self._num_units], dtype)
    for freq_index in range(len(freq_inputs)):
      if self._state_is_tuple:
        name_prefix = "state_f%02d" % freq_index
        c_prev_time = getattr(state, name_prefix + "_c")
        m_prev_time = getattr(state, name_prefix + "_m")
      else:
        # Packed layout: block k occupies 2 * num_units columns, c then m.
        c_prev_time = array_ops.slice(
            state, [0, 2 * freq_index * self._num_units],
            [-1, self._num_units])
        m_prev_time = array_ops.slice(
            state, [0, (2 * freq_index + 1) * self._num_units],
            [-1, self._num_units])

      # i = input_gate, j = new_input, f = forget_gate, o = output_gate
      cell_inputs = array_ops.concat(1, [freq_inputs[freq_index], m_prev_time,
                                         m_prev_freq])

      # F-LSTM
      lstm_matrix_freq = nn_ops.bias_add(math_ops.matmul(cell_inputs,
                                                         concat_w_f), b_f)
      if self._couple_input_forget_gates:
        i_freq, j_freq, o_freq = array_ops.split(1, num_gates,
                                                 lstm_matrix_freq)
        f_freq = None
      else:
        i_freq, j_freq, f_freq, o_freq = array_ops.split(1, num_gates,
                                                         lstm_matrix_freq)
      # T-LSTM
      if self._share_time_frequency_weights:
        i_time = i_freq
        j_time = j_freq
        f_time = f_freq
        o_time = o_freq
      else:
        lstm_matrix_time = nn_ops.bias_add(math_ops.matmul(cell_inputs,
                                                           concat_w_t), b_t)
        if self._couple_input_forget_gates:
          i_time, j_time, o_time = array_ops.split(1, num_gates,
                                                   lstm_matrix_time)
          f_time = None
        else:
          # In this branch num_gates is 4, so the literal 4 is equivalent.
          i_time, j_time, f_time, o_time = array_ops.split(1, 4,
                                                           lstm_matrix_time)

      # F-LSTM c_freq
      # input gate activations
      if self._use_peepholes:
        i_freq_g = sigmoid(i_freq +
                           w_i_diag_freqf * c_prev_freq +
                           w_i_diag_freqt * c_prev_time)
      else:
        i_freq_g = sigmoid(i_freq)
      # forget gate activations
      if self._couple_input_forget_gates:
        f_freq_g = 1.0 - i_freq_g
      else:
        if self._use_peepholes:
          f_freq_g = sigmoid(f_freq + self._forget_bias +
                             w_f_diag_freqf * c_prev_freq +
                             w_f_diag_freqt * c_prev_time)
        else:
          f_freq_g = sigmoid(f_freq + self._forget_bias)
      # cell state
      c_freq = f_freq_g * c_prev_freq + i_freq_g * tanh(j_freq)
      if self._cell_clip is not None:
        # pylint: disable=invalid-unary-operand-type
        c_freq = clip_ops.clip_by_value(c_freq, -self._cell_clip,
                                        self._cell_clip)
        # pylint: enable=invalid-unary-operand-type

      # T-LSTM c_time
      # input gate activations
      if self._use_peepholes:
        if self._share_time_frequency_weights:
          i_time_g = sigmoid(i_time +
                             w_i_diag_freqf * c_prev_freq +
                             w_i_diag_freqt * c_prev_time)
        else:
          i_time_g = sigmoid(i_time +
                             w_i_diag_timef * c_prev_freq +
                             w_i_diag_timet * c_prev_time)
      else:
        i_time_g = sigmoid(i_time)
      # forget gate activations
      if self._couple_input_forget_gates:
        f_time_g = 1.0 - i_time_g
      else:
        if self._use_peepholes:
          if self._share_time_frequency_weights:
            f_time_g = sigmoid(f_time + self._forget_bias +
                               w_f_diag_freqf * c_prev_freq +
                               w_f_diag_freqt * c_prev_time)
          else:
            f_time_g = sigmoid(f_time + self._forget_bias +
                               w_f_diag_timef * c_prev_freq +
                               w_f_diag_timet * c_prev_time)
        else:
          f_time_g = sigmoid(f_time + self._forget_bias)
      # cell state
      c_time = f_time_g * c_prev_time + i_time_g * tanh(j_time)
      if self._cell_clip is not None:
        # pylint: disable=invalid-unary-operand-type
        c_time = clip_ops.clip_by_value(c_time, -self._cell_clip,
                                        self._cell_clip)
        # pylint: enable=invalid-unary-operand-type

      # F-LSTM m_freq
      if self._use_peepholes:
        m_freq = sigmoid(o_freq +
                         w_o_diag_freqf * c_freq +
                         w_o_diag_freqt * c_time) * tanh(c_freq)
      else:
        m_freq = sigmoid(o_freq) * tanh(c_freq)

      # T-LSTM m_time
      if self._use_peepholes:
        if self._share_time_frequency_weights:
          m_time = sigmoid(o_time +
                           w_o_diag_freqf * c_freq +
                           w_o_diag_freqt * c_time) * tanh(c_time)
        else:
          m_time = sigmoid(o_time +
                           w_o_diag_timef * c_freq +
                           w_o_diag_timet * c_time) * tanh(c_time)
      else:
        m_time = sigmoid(o_time) * tanh(c_time)

      # The F-LSTM state is carried over to the next frequency block.
      m_prev_freq = m_freq
      c_prev_freq = c_freq
      # Concatenate the outputs for T-LSTM and F-LSTM for each shift
      if freq_index == 0:
        state_out_lst = [c_time, m_time]
        m_out_lst = [m_time, m_freq]
      else:
        state_out_lst.extend([c_time, m_time])
        m_out_lst.extend([m_time, m_freq])
    if self._state_is_tuple:
      state_out = self._state_tuple_type(*state_out_lst)
    else:
      state_out = array_ops.concat(1, state_out_lst)
    # Outputs are always concated as it is never used separately.
    m_out = array_ops.concat(1, m_out_lst)

  return m_out, state_out
def _init_nn(self):
  """Build the network graph and store the prediction tensor in `self.y`.

  The graph consists of, in order: a shared LSTM layer applied to every
  single-feature slice of each input group (averaged per group), a
  multi-input RNN layer over the concatenated group outputs, an attention
  stage producing `y_tilde`, and a final linear layer followed by ReLU.
  """
  # split the input based on divider
  x_splits = tf.split(self.x, self.x_divider, 2)

  # LSTM layer, share the variants(weights and bias)
  with tf.variable_scope("LSTM_layer", reuse=tf.AUTO_REUSE):
    # out put of LSTM layer.
    out_lstm_layer = []
    for part_1st in x_splits:
      # Split the group into slices of size 1 along axis 2.
      split_s = tf.split(
          part_1st, tf.ones(part_1st.shape[2], dtype='int32'), 2)
      # expand dimension for the reduce mean later
      expanded_outputs = []
      for part_2nd in split_s:
        cell_part_2nd = self.add_rnn(1, self.hidden_size, self.keep_prob)
        out_cell, _ = tf.nn.dynamic_rnn(
            cell_part_2nd, part_2nd, dtype=tf.float32)
        expanded_outputs.append(tf.expand_dims(out_cell, 0))
      # Average the per-slice outputs of this group.
      out_lstm_layer.append(
          tf.reduce_mean(tf.concat(expanded_outputs, 0), 0))

  with tf.variable_scope("MI_RNN_layer"):
    mi_rnn_input = concat(out_lstm_layer, 2)
    rnn_mi_cells = self.add_rnn(self.layer_size, self.hidden_size,
                                self.keep_prob, MultiInputLSTMCell)
    # out_mi, shape = (batch_size, seq_length, p)
    out_mi, _ = tf.nn.dynamic_rnn(rnn_mi_cells, mi_rnn_input,
                                  dtype=tf.float32)

  with tf.variable_scope("final_attn"):
    # output of multi input cell, shape (?, seq, p)
    # These per-example transforms use tf.map_fn rather than tf.scan:
    # without an `initializer`, tf.scan seeds the accumulator with the first
    # element and emits it unchanged, so the first batch element would skip
    # the matmul, the bias add and the softmax.
    j = tf.map_fn(lambda x: tf.matmul(a=x, b=self._w_f_attn), out_mi)
    # shape (?, seq, p)
    # add bias TODO check the behavior of bias_add in two dimension case
    j = tf.map_fn(lambda x: nn_ops.bias_add(x, self._b_f_attn), j)
    # shape (?, seq, p)
    scan_init = tf.constant(
        np.zeros((j.shape[1], self._v_f_attn.shape[1])), dtype=tf.float32)
    # finally the shape of j is (?, seq, 1)
    # The accumulator is ignored, so with an initializer this scan acts as a
    # per-example map.
    j = tf.scan(lambda a, x: matmul(tanh(x), self._v_f_attn), j,
                initializer=scan_init)
    # NOTE(review): each x here is presumably (seq, 1), so softmax over
    # axis=1 normalises a single value; a softmax across the sequence axis
    # may have been intended - confirm.
    beta = tf.map_fn(lambda x: tf.nn.softmax(x, axis=1), j)
    # shape of beta: (?,seq_step,1)
    # shape of out_mi: (?,seq_step, p)
    scan_init = tf.constant(
        np.zeros((beta.shape[2], out_mi.shape[2])), dtype=tf.float32)

    def f(a, x):
      # Weighted sum of the sequence outputs of one example: beta^T @ out_mi.
      (out_mi_x, beta_x) = x
      return matmul(a=beta_x, b=out_mi_x, transpose_a=True)

    y_tilde = tf.scan(f, (out_mi, beta), initializer=scan_init)

  with tf.variable_scope("activation"):
    scan_init = tf.constant(np.zeros((1, 1)), dtype=tf.float32)
    act_input = tf.scan(
        lambda a, x: matmul(a=self._w_activation, b=x, transpose_b=True),
        y_tilde, initializer=scan_init)
    act_input = tf.reshape(act_input, [-1, 1])
    act_input = nn_ops.bias_add(act_input, self._b_activation)
    self.y = relu(act_input)
def testDeterministicGradients(self, data_layout, data_rank, data_type):
  """Checks that the bias gradient of `bias_add` is bit-wise reproducible.

  The gradient with respect to the bias is computed twice for the same
  upstream gradients, several times over, and both results must be equal.

  Args:
    data_layout: layout identifier, forwarded to `_makeShapeTuple` and
      `_dataFormatFromDataLayout`.
    data_rank: rank parameter forwarded to `_makeShapeTuple`.
    data_type: dtype of the generated input, bias and upstream gradients.
  """
  with self.session(force_gpu=True):
    # Using a cached_session with force_gpu=True does not work at the time
    # of writing (2019-12-10). Before the @parameterized.named_parameters
    # decorator was added, this non-cached session context was set outside
    # the iteration loops for the parameter combinations, and so was re-used.
    seed = sum(
        hash(param) % 256 for param in (data_layout, data_rank, data_type))
    np.random.seed(seed)
    batch_size, channel_count, data_dim = 10, 8, 14
    repeat_count = 5
    input_shape = self._makeShapeTuple(batch_size, channel_count, data_rank,
                                       data_dim, data_layout)
    output_shape = input_shape
    bias_shape = (channel_count,)
    input_val = self._randomDataOp(input_shape, data_type)
    bias_val = self._randomDataOp(bias_shape, data_type)
    data_format = self._dataFormatFromDataLayout(data_layout)

    if not context.executing_eagerly():
      # Graph mode.
      upstream_gradients = array_ops.placeholder(
          data_type, shape=output_shape, name='upstream_gradients')
      bias_add_output = nn_ops.bias_add(
          input_val, bias_val, data_format=data_format)
      gradient_injector_output = bias_add_output * upstream_gradients
      # The gradient function behaves as if grad_ys is multiplied by the op
      # gradient result, not passing the upstream gradients through the op's
      # gradient generation graph. This is the reason for using the
      # gradient injector
      bias_gradients = gradients_impl.gradients(
          gradient_injector_output,
          bias_val,
          grad_ys=None,
          colocate_gradients_with_ops=True)[0]
      for _ in range(repeat_count):
        feed_dict = {upstream_gradients: self._randomNDArray(output_shape)}
        first_run = bias_gradients.eval(feed_dict=feed_dict)
        second_run = bias_gradients.eval(feed_dict=feed_dict)
        self.assertAllEqual(first_run, second_run)
      return

    # Eager mode.
    def bias_gradients(local_seed):
      # Re-seeding makes both invocations draw identical upstream gradients.
      np.random.seed(local_seed)
      upstream_gradients = self._randomDataOp(output_shape, data_type)
      with backprop.GradientTape(persistent=True) as tape:
        tape.watch(bias_val)
        bias_add_output = nn_ops.bias_add(
            input_val, bias_val, data_format=data_format)
        gradient_injector_output = bias_add_output * upstream_gradients
      return tape.gradient(gradient_injector_output, bias_val)

    for step in range(repeat_count):
      local_seed = seed + step  # select different upstream gradients
      first_run = bias_gradients(local_seed)
      second_run = bias_gradients(local_seed)
      self.assertAllEqual(first_run, second_run)
def call(self, inputs, state):
  """Run one step of LSTM.

  Args:
    inputs: input Tensor, must be 2-D, `[batch, input_size]`.
    state: if `state_is_tuple` is False, this must be a state Tensor,
      `2-D, [batch, state_size]`. If `state_is_tuple` is True, this must be a
      tuple of state Tensors, both `2-D`, with column sizes `c_state` and
      `m_state`.

  Returns:
    A tuple containing:
    - A `2-D, [batch, output_dim]`, Tensor representing the output of the
      LSTM after reading `inputs` when previous state was `state`.
      Here output_dim is:
         num_proj if num_proj was set,
         num_units otherwise.
    - Tensor(s) representing the new state of LSTM after reading `inputs`
      when the previous state was `state`. Same type and shape(s) as `state`.

  Raises:
    ValueError: If input size cannot be inferred from inputs via
      static shape inference.
    NotImplementedError: If peepholes are disabled and `gate_mod` is not one
      of the supported modes.
  """
  num_proj = self._num_units if self._num_proj is None else self._num_proj
  sigmoid = math_ops.sigmoid

  if self._state_is_tuple:
    (c_prev, m_prev) = state
  else:
    c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
    m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])

  input_size = inputs.get_shape().with_rank(2)[1]
  if input_size.value is None:
    raise ValueError(
        "Could not infer input size from inputs.get_shape()[-1]")

  # No feedback, if desired; also, gcnn/cnn do not have feedback
  if self._no_feedback or self._gate_mod in ["gcnn", "cnn"]:
    # zeros_like keeps the dtype of m_prev and also works when the batch
    # dimension is not statically known; zeros built from the static shape
    # are always float32 and need a fully defined shape.
    m_prev = array_ops.zeros_like(m_prev)

  # i = input_gate, j = new_input, f = forget_gate, o = output_gate
  if self._ngram:
    # The inputs are added directly; only m_prev goes through the kernel.
    lstm_matrix = inputs + math_ops.matmul(m_prev, self._kernel)
  else:
    lstm_matrix = math_ops.matmul(
        array_ops.concat([inputs, m_prev], 1), self._kernel)
  lstm_matrix = nn_ops.bias_add(lstm_matrix, self._bias)

  i, j, f, o = array_ops.split(
      value=lstm_matrix, num_or_size_splits=4, axis=1)

  # Cell state update. Peepholes (diagonal connections) take precedence over
  # `gate_mod`.
  if self._use_peepholes:
    c = (sigmoid(f + self._forget_bias + self._w_f_diag * c_prev) * c_prev +
         sigmoid(i + self._w_i_diag * c_prev) * self._activation(j))
  elif self._gate_mod == "lstm":
    c = (sigmoid(f + self._forget_bias) * c_prev +
         sigmoid(i) * self._activation(j))
  elif self._gate_mod == "rkm_lstm":
    c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) * j)
  elif self._gate_mod == "rkm_cifg":
    # Coupled gates: the input weight is one minus the forget gate.
    c = (sigmoid(f + self._forget_bias) * c_prev +
         (1 - sigmoid(f + self._forget_bias)) * j)
  elif self._gate_mod in ["gated_linear", "linear"]:
    # sigma2_f = 0.5
    # sigma2_i = 0.5
    # c = (sigma2_f * c_prev + sigma2_i * j)
    c = (self._sigma2_f * c_prev + self._sigma2_i * j)
  elif self._gate_mod in ["gcnn", "cnn"]:
    sigma2_i = 1
    c = sigma2_i * j
  else:
    raise NotImplementedError("Invalid gate_mod: {0}".format(
        self._gate_mod))

  if self._layer_norm:
    c = tf.contrib.layers.layer_norm(c)

  if self._cell_clip is not None:
    # pylint: disable=invalid-unary-operand-type
    c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
    # pylint: enable=invalid-unary-operand-type

  # Output computation, again with peepholes taking precedence.
  if self._use_peepholes:
    m = sigmoid(o + self._w_o_diag * c) * self._activation(c)
  elif self._gate_mod == "lstm":
    m = sigmoid(o) * self._activation(c)
  elif self._gate_mod in [
      "rkm_lstm", "rkm_cifg", "gated_linear", "gcnn"
  ]:
    m = sigmoid(o) * c
  elif self._gate_mod in ["linear", "cnn"]:
    m = self._activation(c)
  else:
    raise NotImplementedError("Invalid gate_mod: {0}".format(
        self._gate_mod))

  if self._num_proj is not None:
    m = math_ops.matmul(m, self._proj_kernel)

    if self._proj_clip is not None:
      # pylint: disable=invalid-unary-operand-type
      m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
      # pylint: enable=invalid-unary-operand-type

  new_state = (LSTMStateTuple(c, m)
               if self._state_is_tuple else array_ops.concat([c, m], 1))
  return m, new_state
def _SetupValuesForDevice(self, tensor_in_sizes, filter_in_sizes, bias,
                          strides, padding, activation_mode, data_format,
                          filter_format, dtype):
  """Builds the fused conv+bias+activation op and a reference computation.

  Args:
    tensor_in_sizes: Input tensor dimensions in
      [batch, input_rows, input_cols, input_depth].
    filter_in_sizes: Filter tensor dimensions in
      [kernel_rows, kernel_cols, input_depth, output_depth].
    bias: 1-D bias tensor of length output_depth.
    strides: Stride: [col_stride, row_stride]
    padding: Padding type.
    activation_mode: Activation mode.
    data_format: Format of the data tensors.
    filter_format: Filter format to use for the fused convolution.
    dtype: Data type for inputs and outputs.

  Returns:
    Symbolic tensor value and reference value that can be used to
    execute the computation and verify the results. The reference is
    conv2d -> bias_add -> relu; both values are returned in NHWC layout.
  """
  bias_size = filter_in_sizes[-1]  # equals to output depth
  # Input and filter are filled with incrementing numbers starting from 1.
  input_values = [
      n * 1.0 for n in range(1, np.prod(tensor_in_sizes) + 1)
  ]
  filter_values = [
      n * 1.0 for n in range(1, np.prod(filter_in_sizes) + 1)
  ]
  with self.cached_session(use_gpu=True), self.test_scope():
    input_t = constant_op.constant(
        input_values, shape=tensor_in_sizes, dtype=dtype)
    filter_t = constant_op.constant(
        filter_values, shape=filter_in_sizes, dtype=dtype)
    # Only the fused op receives the re-laid-out filter; the reference
    # convolution keeps the original one.
    if filter_format == "OIHW":
      fused_filter_t = _HwioToOihw(filter_t)
    else:
      fused_filter_t = filter_t
    bias_t = constant_op.constant(bias, shape=[bias_size], dtype=dtype)

    strides = [1] + strides + [1]
    channels_first = data_format == "NCHW"
    if channels_first:
      input_t = test_util.NHWCToNCHW(input_t)
      strides = test_util.NHWCToNCHW(strides)

    output = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
        input_t,
        fused_filter_t,
        bias_t,
        strides=strides,
        padding=padding,
        data_format=data_format,
        filter_format=filter_format,
        activation_mode=activation_mode)

    ref_output = nn_ops.relu(
        nn_ops.bias_add(
            nn_ops.conv2d(
                input_t,
                filter_t,
                strides=strides,
                padding=padding,
                data_format=data_format),
            bias_t,
            data_format=data_format))

    if channels_first:
      output = test_util.NCHWToNHWC(output)
      ref_output = test_util.NCHWToNHWC(ref_output)

    return output, ref_output
def __call__(self, inputs, state, scope=None):
  """Run one step of LSTM.

  Args:
    inputs: input Tensor, 2D, batch x num_units.
    state: if `state_is_tuple` is False, this must be a state Tensor,
      `2-D, batch x state_size`. If `state_is_tuple` is True, this must be a
      tuple of state Tensors, both `2-D`, with column sizes `c_state` and
      `m_state`.
    scope: VariableScope for the created subgraph; defaults to "LSTMCell".

  Returns:
    A tuple containing:
    - A `2-D, [batch x output_dim]`, Tensor representing the output of the
      LSTM after reading `inputs` when previous state was `state`.
      Here output_dim is:
         num_proj if num_proj was set,
         num_units otherwise.
    - Tensor(s) representing the new state of LSTM after reading `inputs`
      when the previous state was `state`. Same type and shape(s) as `state`.

  Raises:
    ValueError: If input size cannot be inferred from inputs via
      static shape inference.
  """
  if self._num_proj is None:
    num_proj = self._num_units
  else:
    num_proj = self._num_proj

  if self._state_is_tuple:
    c_prev, m_prev = state
  else:
    # Packed state: cell state first, then the output.
    c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
    m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])

  dtype = inputs.dtype
  input_size = inputs.get_shape().with_rank(2)[1]
  if input_size.value is None:
    raise ValueError("Could not infer input size from inputs.get_shape()[-1]")

  peepholes = self._use_peepholes
  with vs.variable_scope(scope or type(self).__name__,
                         initializer=self._initializer):  # "LSTMCell"
    kernel = _get_concat_variable(
        "W", [input_size.value + num_proj, 4 * self._num_units],
        dtype, self._num_unit_shards)
    bias = vs.get_variable(
        "B", shape=[4 * self._num_units],
        initializer=array_ops.zeros_initializer, dtype=dtype)

    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    cell_inputs = array_ops.concat(1, [inputs, m_prev])
    lstm_matrix = nn_ops.bias_add(math_ops.matmul(cell_inputs, kernel), bias)
    i, j, f, o = array_ops.split(1, 4, lstm_matrix)

    if peepholes:
      # Diagonal connections, created in the order forget, input, output.
      w_f_diag, w_i_diag, w_o_diag = [
          vs.get_variable(name, shape=[self._num_units], dtype=dtype)
          for name in ("W_F_diag", "W_I_diag", "W_O_diag")
      ]
      retained = (
          sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev)
      written = sigmoid(i + w_i_diag * c_prev) * self._activation(j)
    else:
      retained = sigmoid(f + self._forget_bias) * c_prev
      written = sigmoid(i) * self._activation(j)
    c = retained + written

    if self._cell_clip is not None:
      # pylint: disable=invalid-unary-operand-type
      c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
      # pylint: enable=invalid-unary-operand-type

    if peepholes:
      output_gate = sigmoid(o + w_o_diag * c)
    else:
      output_gate = sigmoid(o)
    m = output_gate * self._activation(c)

    if self._num_proj is not None:
      projection = _get_concat_variable(
          "W_P", [self._num_units, self._num_proj],
          dtype, self._num_proj_shards)
      m = math_ops.matmul(m, projection)
      if self._proj_clip is not None:
        # pylint: disable=invalid-unary-operand-type
        m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
        # pylint: enable=invalid-unary-operand-type

  if self._state_is_tuple:
    new_state = LSTMStateTuple(c, m)
  else:
    new_state = array_ops.concat(1, [c, m])
  return m, new_state
def call(self, inputs, state):
  """Run one step of the IndRNN.

  Calculates the output and new hidden state using the IndRNN equation

    `output = new_state = act(W * input + u (*) state + b)`

  , where `*` is the matrix multiplication and `(*)` is the Hadamard product.
  When `self.topdown` is set, a batch-normalised `last_state @
  hierarchy_kernel1` term is added as well.

  Args:
    inputs: Tensor, 2-dimensional tensor of shape `[batch, num_units]`.
    state: indexable pair; `state[0]` is multiplied element-wise with the
      recurrent kernel and `state[1]` is used as the top-down input.
      NOTE(review): presumably both are `[batch, num_units]` tensors -
      confirm against the caller.

  Returns:
    A tuple containing the output and new hidden state. Both are the same
    tensor.

  Raises:
    IndexError: if `self._layer_idx` is not 0 and the `input_kernel_top`
      variable of the previous layer is not among the global variables.
  """
  last_state = state[1]

  # The final layer does not need to clip.
  if self.topdown and self._layer_idx < 5:
    w_l2norm = math_ops.sqrt(
        math_ops.matmul(self._hierarchy_kernel1, self._input_kernel_top))
    # Both kernels are rescaled with the same factor, computed from their
    # values before the rescaling.
    norm_floor = tf.maximum(self._recurrent_max_abs_tensor, w_l2norm)
    self._input_kernel_top = (
        self._input_kernel_top * self._recurrent_max_abs / norm_floor)
    self._hierarchy_kernel1 = (
        self._hierarchy_kernel1 * self._recurrent_max_abs / norm_floor)

  if self._layer_idx == 0:
    gate_inputs = math_ops.matmul(inputs, self._input_kernel)
  else:
    # Layers above the first multiply their input with the `input_kernel_top`
    # variable owned by the layer below, looked up by its graph name.
    prev_kernel_name = ("rnn/multi_rnn_cell/cell_" +
                        str(self._layer_idx - 1) +
                        "/ind_rnn_cell/input_kernel_top:0")
    candidates = [
        v for v in tf.global_variables() if v.name == prev_kernel_name
    ]
    if not candidates:
      raise IndexError(
          "No global variable named %r was found." % prev_kernel_name)
    gate_inputs = math_ops.matmul(inputs, candidates[0])

  # NOTE(review): batch norm is always built in training mode here.
  is_training = True
  gate_inputs = batch_norm(gate_inputs, 'gate_inputs', is_training)
  recurrent_update = math_ops.multiply(state[0], self._recurrent_kernel)
  if self.topdown:
    hierarchy_update = math_ops.matmul(last_state, self._hierarchy_kernel1)
    hierarchy_update = batch_norm(hierarchy_update, 'hierarchy_update',
                                  is_training)
    gate_inputs = math_ops.add(gate_inputs, hierarchy_update)
  gate_inputs = math_ops.add(gate_inputs, recurrent_update)
  gate_inputs = nn_ops.bias_add(gate_inputs, self._bias)
  output = self._activation(gate_inputs)
  return output, output
def __call__(self, inputs, state, scope=None):
    """Run one step of LSTM.

    Args:
      inputs: input Tensor, 2D, batch x num_units.
      state: state Tensor, 2D, batch x state_size. The first `num_units`
        columns are read as the previous cell state; the following
        `num_proj` columns (`num_units` when no projection is configured)
        are read as the previous output.
      scope: VariableScope for the created subgraph; defaults to the class
        name ("LSTMCell").

    Returns:
      A tuple containing:
      - A 2D, batch x output_dim, Tensor representing the output of the LSTM
        after reading "inputs" when previous state was "state".
        Here output_dim is:
           num_proj if num_proj was set,
           num_units otherwise.
      - A 2D, batch x state_size, Tensor representing the new state of LSTM
        after reading "inputs" when previous state was "state": the new cell
        state and the output concatenated along axis 1.

    Raises:
      ValueError: if an input_size was specified and the provided inputs
        have a different dimension.
    """
    num_proj = self._num_units if self._num_proj is None else self._num_proj

    c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
    m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])

    dtype = inputs.dtype
    actual_input_size = inputs.get_shape().as_list()[1]
    if self._input_size and self._input_size != actual_input_size:
        # Both values must be passed to `%` as one tuple; otherwise only the
        # first is formatted and a TypeError is raised instead of the
        # documented ValueError. `%s` keeps the message identical for ints
        # and still works when the static size is unknown (None).
        raise ValueError("Actual input size not same as specified: %s vs %s."
                         % (actual_input_size, self._input_size))

    with vs.variable_scope(scope or type(self).__name__,
                           initializer=self._initializer):  # "LSTMCell"
        concat_w = _get_concat_variable(
            "W", [actual_input_size + num_proj, 4 * self._num_units],
            dtype, self._num_unit_shards)

        b = vs.get_variable(
            "B", shape=[4 * self._num_units],
            initializer=array_ops.zeros_initializer, dtype=dtype)

        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        cell_inputs = array_ops.concat(1, [inputs, m_prev])
        lstm_matrix = nn_ops.bias_add(math_ops.matmul(cell_inputs, concat_w), b)
        i, j, f, o = array_ops.split(1, 4, lstm_matrix)

        # Diagonal connections
        if self._use_peepholes:
            w_f_diag = vs.get_variable(
                "W_F_diag", shape=[self._num_units], dtype=dtype)
            w_i_diag = vs.get_variable(
                "W_I_diag", shape=[self._num_units], dtype=dtype)
            w_o_diag = vs.get_variable(
                "W_O_diag", shape=[self._num_units], dtype=dtype)

        if self._use_peepholes:
            c = (sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
                 sigmoid(i + w_i_diag * c_prev) * tanh(j))
        else:
            c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) * tanh(j))

        # Optional symmetric clipping of the cell state.
        if self._cell_clip is not None:
            c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)

        if self._use_peepholes:
            m = sigmoid(o + w_o_diag * c) * tanh(c)
        else:
            m = sigmoid(o) * tanh(c)

        # Optional output projection from num_units down to num_proj.
        if self._num_proj is not None:
            concat_w_proj = _get_concat_variable(
                "W_P", [self._num_units, self._num_proj],
                dtype, self._num_proj_shards)

            m = math_ops.matmul(m, concat_w_proj)

    return m, array_ops.concat(1, [c, m])
def _testBias(self, np_inputs, np_bias, use_gpu=False):
    """Compare `nn_ops.bias_add` with the `_npBias` reference result.

    Args:
      np_inputs: value passed as the first argument to both implementations.
      np_bias: bias passed as the second argument to both implementations.
      use_gpu: forwarded to `cached_session`.
    """
    expected = self._npBias(np_inputs, np_bias)
    with self.cached_session(use_gpu=use_gpu):
        bias_add_op = nn_ops.bias_add(np_inputs, np_bias)
        actual = bias_add_op.eval()
    self.assertAllCloseAccordingToType(expected, actual)
def __call__(self, inputs, state, scope=None):
    """Run one time step of the LSTM across every frequency slice.

    The input is split by `self._make_tf_features` into a sequence of
    per-frequency feature tensors. One LSTM step is evaluated per slice with
    shared weights; each step also receives the output of the previous
    frequency step (zeros for the first one).

    Args:
      inputs: input Tensor, 2D, batch x num_units. Its static batch dimension
        is converted with `int(...)` to size the initial zero tensor.
      state: state Tensor, 2D, batch x state_size. For frequency index `k`
        the columns `[2k * num_units, (2k + 1) * num_units)` are read as the
        previous cell state and the next `num_units` columns as the previous
        output.
      scope: VariableScope for the created subgraph; defaults to the class
        name ("TimeFreqLSTMCell").

    Returns:
      A tuple containing:
      - The per-frequency outputs concatenated along axis 1.
      - The per-frequency `(c, m)` pairs concatenated along axis 1, in the
        same column layout that `state` is read with.
    """
    sigmoid = math_ops.sigmoid
    tanh = math_ops.tanh
    num_units = self._num_units

    freq_inputs = self._make_tf_features(inputs)
    dtype = inputs.dtype
    actual_input_size = freq_inputs[0].get_shape().as_list()[1]

    scope_name = scope or type(self).__name__  # "TimeFreqLSTMCell"
    with vs.variable_scope(scope_name, initializer=self._initializer):
        concat_w = _get_concat_variable(
            "W", [actual_input_size + 2 * num_units, 4 * num_units],
            dtype, self._num_unit_shards)
        b = vs.get_variable(
            "B", shape=[4 * num_units],
            initializer=init_ops.zeros_initializer, dtype=dtype)

        # Diagonal (peephole) connections
        if self._use_peepholes:
            w_f_diag = vs.get_variable("W_F_diag", shape=[num_units], dtype=dtype)
            w_i_diag = vs.get_variable("W_I_diag", shape=[num_units], dtype=dtype)
            w_o_diag = vs.get_variable("W_O_diag", shape=[num_units], dtype=dtype)

        # The first frequency step sees an all-zero "previous frequency" output.
        batch_size = int(inputs.get_shape()[0])
        m_prev_freq = array_ops.zeros([batch_size, num_units], dtype)

        for fq, freq_input in enumerate(freq_inputs):
            c_offset = 2 * fq * num_units
            m_offset = (2 * fq + 1) * num_units
            c_prev = array_ops.slice(state, [0, c_offset], [-1, num_units])
            m_prev = array_ops.slice(state, [0, m_offset], [-1, num_units])

            # i = input_gate, j = new_input, f = forget_gate, o = output_gate
            cell_inputs = array_ops.concat(1, [freq_input, m_prev, m_prev_freq])
            pre_activation = math_ops.matmul(cell_inputs, concat_w)
            lstm_matrix = nn_ops.bias_add(pre_activation, b)
            i, j, f, o = array_ops.split(1, 4, lstm_matrix)

            if self._use_peepholes:
                kept = sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev
                written = sigmoid(i + w_i_diag * c_prev) * tanh(j)
            else:
                kept = sigmoid(f + self._forget_bias) * c_prev
                written = sigmoid(i) * tanh(j)
            c = kept + written

            if self._cell_clip is not None:
                # pylint: disable=invalid-unary-operand-type
                c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
                # pylint: enable=invalid-unary-operand-type

            if self._use_peepholes:
                out_gate = sigmoid(o + w_o_diag * c)
            else:
                out_gate = sigmoid(o)
            m = out_gate * tanh(c)

            m_prev_freq = m
            if fq != 0:
                state_out = array_ops.concat(1, [state_out, c, m])
                m_out = array_ops.concat(1, [m_out, m])
            else:
                state_out = array_ops.concat(1, [c, m])
                m_out = m
    return m_out, state_out
def testInputDims(self):
    """`bias_add` must raise ValueError for value `[1, 2]` with bias `[1]`."""
    value, bias = [1, 2], [1]
    with self.assertRaises(ValueError):
        nn_ops.bias_add(value, bias)
def joint_weighted_sum_from_feature_columns(columns_to_tensors,
                                            feature_columns,
                                            num_outputs,
                                            weight_collections=None,
                                            trainable=True,
                                            scope=None):
    """Build a linear prediction that keeps all column weights in one variable.

    Every feature column must provide wide-embedding lookup arguments; the
    lookups of all columns are combined by `_create_joint_embedding_lookup`
    and a zero-initialized bias is added to the result.

    Args:
      columns_to_tensors: A mapping from feature column to tensors. 'string'
        key means a base feature (not-transformed). It can have FeatureColumn
        as a key too. That means that FeatureColumn is already transformed by
        input pipeline. For example, `inflow` may have handled
        transformations.
      feature_columns: A set containing all the feature columns. All items in
        the set should be instances of classes derived from FeatureColumn.
        Duplicates are dropped and columns are processed in `key` order.
      num_outputs: An integer specifying number of outputs (required).
      weight_collections: List of graph collections to which weights are
        added.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      scope: Optional scope for variable_scope.

    Returns:
      A tuple containing:

        * A Tensor which represents predictions of a linear model.
        * The weights as returned by `_create_joint_embedding_lookup`.
        * A Variable which is used for bias.

    Raises:
      NotImplementedError: if a column does not implement
        `_wide_embedding_lookup_arguments`.
    """
    check_feature_columns(feature_columns)
    with variable_scope.variable_scope(
            scope,
            default_name='joint_weighted_sum_from_feature_columns',
            values=columns_to_tensors.values()):
        transformer = _Transformer(columns_to_tensors)
        embedding_lookup_arguments = []
        ordered_columns = sorted(set(feature_columns), key=lambda col: col.key)
        for column in ordered_columns:
            transformed_tensor = transformer.transform(column)
            try:
                # pylint: disable=protected-access
                lookup_args = column._wide_embedding_lookup_arguments(
                    transformed_tensor)
                # pylint: enable=protected-access
            except NotImplementedError:
                raise NotImplementedError('Real-valued columns are not supported. '
                                          'Use weighted_sum_from_feature_columns '
                                          'instead, or bucketize these columns.')
            embedding_lookup_arguments.append(lookup_args)

        variable, predictions_no_bias = _create_joint_embedding_lookup(
            columns_to_tensors,
            embedding_lookup_arguments,
            num_outputs,
            trainable,
            weight_collections)

        bias_collections = _add_variable_collection(weight_collections)
        bias = contrib_variables.model_variable(
            'bias_weight',
            shape=[num_outputs],
            initializer=init_ops.zeros_initializer(),
            trainable=trainable,
            collections=bias_collections)
        _log_variable(bias)
        predictions = nn_ops.bias_add(predictions_no_bias, bias)

        return predictions, variable, bias
def __call__(self, inputs, state, scope=None):
    """Advance the time-frequency LSTM by one time step.

    `self._make_tf_features` splits the input into per-frequency feature
    tensors. The same weights are applied to every slice in turn, and each
    frequency step is additionally fed the output of the step before it
    (zeros for the first slice).

    Args:
      inputs: input Tensor, 2D, batch x num_units. The static batch dimension
        is converted with `int(...)` to build the initial zero tensor.
      state: state Tensor, 2D, batch x state_size, laid out as consecutive
        `(c, m)` column blocks of width `num_units` per frequency slice.
      scope: VariableScope for the created subgraph; defaults to the class
        name ("TimeFreqLSTMCell").

    Returns:
      A tuple containing:
      - All per-frequency outputs `m`, concatenated along axis 1.
      - All per-frequency `(c, m)` pairs, concatenated along axis 1 in the
        layout `state` is read with.
    """
    units = self._num_units
    freq_inputs = self._make_tf_features(inputs)
    dtype = inputs.dtype
    actual_input_size = freq_inputs[0].get_shape().as_list()[1]

    with vs.variable_scope(scope or type(self).__name__,
                           initializer=self._initializer):  # "TimeFreqLSTMCell"
        weight_shape = [actual_input_size + 2 * units, 4 * units]
        concat_w = _get_concat_variable(
            "W", weight_shape, dtype, self._num_unit_shards)
        b = vs.get_variable(
            "B",
            shape=[4 * units],
            initializer=array_ops.zeros_initializer,
            dtype=dtype)

        # Diagonal connections
        if self._use_peepholes:
            w_f_diag = vs.get_variable("W_F_diag", shape=[units], dtype=dtype)
            w_i_diag = vs.get_variable("W_I_diag", shape=[units], dtype=dtype)
            w_o_diag = vs.get_variable("W_O_diag", shape=[units], dtype=dtype)

        # initialize the first freq state to be zero
        zeros_shape = [int(inputs.get_shape()[0]), units]
        m_prev_freq = array_ops.zeros(zeros_shape, dtype)

        for fq, features in enumerate(freq_inputs):
            c_start = 2 * fq * units
            m_start = (2 * fq + 1) * units
            c_prev = array_ops.slice(state, [0, c_start], [-1, units])
            m_prev = array_ops.slice(state, [0, m_start], [-1, units])

            # i = input_gate, j = new_input, f = forget_gate, o = output_gate
            cell_inputs = array_ops.concat(1, [features, m_prev, m_prev_freq])
            lstm_matrix = nn_ops.bias_add(
                math_ops.matmul(cell_inputs, concat_w), b)
            i, j, f, o = array_ops.split(1, 4, lstm_matrix)

            if not self._use_peepholes:
                retained = sigmoid(f + self._forget_bias) * c_prev
                c = retained + sigmoid(i) * tanh(j)
            else:
                retained = (
                    sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev)
                c = retained + sigmoid(i + w_i_diag * c_prev) * tanh(j)

            if self._cell_clip is not None:
                # pylint: disable=invalid-unary-operand-type
                c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
                # pylint: enable=invalid-unary-operand-type

            if not self._use_peepholes:
                m = sigmoid(o) * tanh(c)
            else:
                m = sigmoid(o + w_o_diag * c) * tanh(c)

            # The next frequency step is conditioned on this step's output.
            m_prev_freq = m
            if fq == 0:
                state_out, m_out = array_ops.concat(1, [c, m]), m
            else:
                state_out = array_ops.concat(1, [state_out, c, m])
                m_out = array_ops.concat(1, [m_out, m])
    return m_out, state_out
def weighted_sum_from_feature_columns(columns_to_tensors,
                                      feature_columns,
                                      num_outputs,
                                      weight_collections=None,
                                      trainable=True,
                                      scope=None):
    """A tf.contrib.layer style linear prediction builder based on FeatureColumns.

    For each distinct feature column (processed in `key` order) a weight
    variable and a per-column prediction are created. The per-column
    predictions are summed and a zero-initialized bias is added. The weighted
    sum refers to logits in classification problems and to the prediction
    itself in linear regression problems.

    Columns that implement `_wide_embedding_lookup_arguments` go through
    `_create_embedding_lookup`. Columns that raise `NotImplementedError`
    there are converted to a dense tensor and multiplied by a
    zero-initialized `weight` variable.

    Example:

      ```
      # Building model for training
      feature_columns = (
          real_valued_column("my_feature1"),
          ...
      )
      columns_to_tensor = tf.parse_example(...)
      logits = weighted_sum_from_feature_columns(
          columns_to_tensors=columns_to_tensor,
          feature_columns=feature_columns,
          num_outputs=1)
      loss = tf.nn.sigmoid_cross_entropy_with_logits(logits, labels)
      ```

    Args:
      columns_to_tensors: A mapping from feature column to tensors. 'string'
        key means a base feature (not-transformed). It can have FeatureColumn
        as a key too. That means that FeatureColumn is already transformed by
        input pipeline. For example, `inflow` may have handled
        transformations.
      feature_columns: A set containing all the feature columns. All items in
        the set should be instances of classes derived from FeatureColumn.
      num_outputs: An integer specifying number of outputs (required).
      weight_collections: List of graph collections to which weights are
        added.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      scope: Optional scope for variable_scope.

    Returns:
      A tuple containing:

        * A Tensor which represents predictions of a linear model.
        * A dictionary which maps feature_column to corresponding Variable.
        * A Variable which is used for bias.

    Raises:
      ValueError: if FeatureColumn cannot be used for linear predictions; a
        ValueError raised while building a column's embedding lookup is
        re-raised with the column name prepended.
    """
    check_feature_columns(feature_columns)
    with variable_scope.variable_scope(
            scope,
            default_name='weighted_sum_from_feature_columns',
            values=columns_to_tensors.values()):
        per_column_predictions = []
        column_to_variable = {}
        transformer = _Transformer(columns_to_tensors)
        # pylint: disable=protected-access
        for column in sorted(set(feature_columns), key=lambda col: col.key):
            transformed_tensor = transformer.transform(column)
            try:
                lookup_args = column._wide_embedding_lookup_arguments(
                    transformed_tensor)
                variable, predictions = _create_embedding_lookup(
                    column,
                    columns_to_tensors,
                    lookup_args,
                    num_outputs,
                    trainable,
                    weight_collections)
            except NotImplementedError:
                # No wide-embedding support: use a dense matmul instead.
                with variable_scope.variable_scope(
                        None,
                        default_name=column.name,
                        values=columns_to_tensors.values()):
                    dense = column._to_dense_tensor(transformed_tensor)
                    dense = fc._reshape_real_valued_tensor(dense, 2, column.name)
                    weight = contrib_variables.model_variable(
                        name='weight',
                        shape=[dense.get_shape()[1], num_outputs],
                        initializer=init_ops.zeros_initializer(),
                        trainable=trainable,
                        collections=weight_collections)
                    variable = [weight]
                    predictions = math_ops.matmul(dense, variable[0], name='matmul')
            except ValueError as err:
                raise ValueError('Error creating weighted sum for column: {}.\n'
                                 '{}'.format(column.name, err))
            per_column_predictions.append(predictions)
            column_to_variable[column] = variable
            _log_variable(variable)
            _maybe_restore_from_checkpoint(column._checkpoint_path(), variable)
        # pylint: enable=protected-access

        predictions_no_bias = math_ops.add_n(per_column_predictions)
        bias_collections = _add_variable_collection(weight_collections)
        bias = contrib_variables.model_variable(
            'bias_weight',
            shape=[num_outputs],
            initializer=init_ops.zeros_initializer(),
            trainable=trainable,
            collections=bias_collections)
        _log_variable(bias)
        predictions = nn_ops.bias_add(predictions_no_bias, bias)

        return predictions, column_to_variable, bias
def __call__(self, inputs, state, scope=None):
    """Run one step of LSTM.

    Args:
      inputs: input Tensor, 2D, batch x num_units.
      state: if `state_is_tuple` is False, this must be a state Tensor,
        `2-D, batch x state_size`, holding the cell state in its first
        `num_units` columns followed by the output. If `state_is_tuple` is
        True, this must be a tuple of state Tensors `(c_state, m_state)`.
      scope: VariableScope for the created subgraph; defaults to the class
        name ("LSTMCell").

    Returns:
      A tuple containing:
      - A `2-D, [batch x output_dim]`, Tensor representing the output of the
        LSTM after reading `inputs` when previous state was `state`.
        Here output_dim is:
           num_proj if num_proj was set,
           num_units otherwise.
      - The new state: an `LSTMStateTuple(c, m)` when `state_is_tuple` is
        True, otherwise `c` and `m` concatenated along axis 1.

    Raises:
      ValueError: If input size cannot be inferred from inputs via
        static shape inference.
    """
    units = self._num_units
    num_proj = self._num_proj if self._num_proj is not None else units

    if not self._state_is_tuple:
        c_prev = array_ops.slice(state, [0, 0], [-1, units])
        m_prev = array_ops.slice(state, [0, units], [-1, num_proj])
    else:
        c_prev, m_prev = state

    dtype = inputs.dtype
    input_size = inputs.get_shape().with_rank(2)[1]
    if input_size.value is None:
        raise ValueError("Could not infer input size from inputs.get_shape()[-1]")

    with vs.variable_scope(scope or type(self).__name__,
                           initializer=self._initializer):  # "LSTMCell"
        concat_w = _get_concat_variable(
            "W", [input_size.value + num_proj, 4 * units],
            dtype, self._num_unit_shards)
        b = vs.get_variable(
            "B",
            shape=[4 * units],
            initializer=array_ops.zeros_initializer,
            dtype=dtype)

        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        cell_inputs = array_ops.concat(1, [inputs, m_prev])
        projected = math_ops.matmul(cell_inputs, concat_w)
        lstm_matrix = nn_ops.bias_add(projected, b)
        i, j, f, o = array_ops.split(1, 4, lstm_matrix)

        # Diagonal connections
        if self._use_peepholes:
            w_f_diag = vs.get_variable("W_F_diag", shape=[units], dtype=dtype)
            w_i_diag = vs.get_variable("W_I_diag", shape=[units], dtype=dtype)
            w_o_diag = vs.get_variable("W_O_diag", shape=[units], dtype=dtype)

        if self._use_peepholes:
            kept = sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev
            written = sigmoid(i + w_i_diag * c_prev) * self._activation(j)
        else:
            kept = sigmoid(f + self._forget_bias) * c_prev
            written = sigmoid(i) * self._activation(j)
        c = kept + written

        if self._cell_clip is not None:
            # pylint: disable=invalid-unary-operand-type
            c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
            # pylint: enable=invalid-unary-operand-type

        if self._use_peepholes:
            out_gate = sigmoid(o + w_o_diag * c)
        else:
            out_gate = sigmoid(o)
        m = out_gate * self._activation(c)

        if self._num_proj is not None:
            concat_w_proj = _get_concat_variable(
                "W_P", [units, self._num_proj],
                dtype, self._num_proj_shards)
            m = math_ops.matmul(m, concat_w_proj)

    if self._state_is_tuple:
        new_state = LSTMStateTuple(c, m)
    else:
        new_state = array_ops.concat(1, [c, m])
    return m, new_state
def loss_fn():
    """Return `(bias_add(matmul(x, kernel), bias) - 1) ** 2` as a scalar.

    `x`, `kernel` and `bias` are taken from the enclosing scope; the
    bias-added product is reshaped to shape `[]` before the subtraction.
    """
    logits = nn_ops.bias_add(math_ops.matmul(x, kernel), bias)
    residual = array_ops.reshape(logits, []) - constant_op.constant(1.)
    return residual * residual