def linear(args, output_size, bias, bias_start=0.0, init_constant_bias=False, initializer=None, scope=None, dtype=tf.float32): """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable. Args: args: a 2D Tensor or a list of 2D, batch x n, Tensors. output_size: int, second dimension of W[i]. bias: boolean, whether to add a bias term or not. bias_start: starting value to initialize the bias; 0 by default. init_constant_bias: boolean. If False, the variable scope initializer will be used to initialize the bias parameter. If True, the bias Parameters will be initialized to a constant value as definied in bias_start. scope: VariableScope for the created subgraph; defaults to "Linear". Returns: A 2D Tensor with shape [batch x output_size] equal to sum_i(args[i] * W[i]), where W[i]s are newly created matrices. Raises: ValueError: if some of the arguments has unspecified or wrong shape. """ assert args is not None if args is None or (nest.is_sequence(args) and not args): raise ValueError("`args` must be specified") if not nest.is_sequence(args): args = [args] # Calculate the total size of arguments on dimension 1. total_arg_size = 0 shapes = [a.get_shape().as_list() for a in args] for shape in shapes: if len(shape) != 2: raise ValueError("Linear is expecting 2D arguments: {0}".format( str(shapes))) if not shape[1]: raise ValueError("Linear expects shape[1] of arguments: {0}".format( str(shapes))) else: total_arg_size += shape[1] # dtype = [a.dtype for a in args][0] # Now the computation. with tf.variable_scope(scope or "Linear"): # , reuse=reuse_variables): matrix = tf.get_variable("Matrix", [total_arg_size, output_size], dtype=dtype, initializer=initializer) if len(args) == 1: res = tf.matmul(args[0], matrix) else: res = tf.matmul(tf.concat(axis=1, values=args), matrix) if not bias: return res if init_constant_bias: init_bias = tf.constant_initializer(bias_start) else: init_bias = initializer bias_term = tf.get_variable("Bias", [output_size], dtype=dtype, initializer=init_bias) return res + bias_term
def __init__(self, cells, state_is_tuple=True): """Create a RNN cell composed sequentially of a number of RNNCells. Args: cells: list of RNNCells that will be composed in this order. state_is_tuple: If True, accepted and returned states are n-tuples, where `n = len(cells)`. If False, the states are all concatenated along the column axis. This latter behavior will soon be deprecated. Raises: ValueError: if cells is empty (not allowed), or at least one of the cells returns a state tuple but the flag `state_is_tuple` is `False`. """ super(MultiRNNCell, self).__init__() if not cells: raise ValueError("Must specify at least one cell for MultiRNNCell.") if not nest.is_sequence(cells): raise TypeError( "cells must be a list or tuple, but saw: %s." % cells) self._cells = cells self._state_is_tuple = state_is_tuple if not state_is_tuple: if any(nest.is_sequence(c.state_size) for c in self._cells): raise ValueError("Some cells return tuples of states, but the flag " "state_is_tuple is not set. State sizes are: %s" % str([c.state_size for c in self._cells]))
def _linear(args, output_size, bias, bias_initializer=None, kernel_initializer=None): """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable. Args: args: a 2D Tensor or a list of 2D, batch x n, Tensors. output_size: int, second dimension of W[i]. bias: boolean, whether to add a bias term or not. bias_initializer: starting value to initialize the bias; None by default. kernel_initializer: starting value to initialize the weight; None by default. Returns: A 2D Tensor with shape [batch x output_size] equal to sum_i(args[i] * W[i]), where W[i]s are newly created matrices. Raises: ValueError: if some of the arguments has unspecified or wrong shape. """ if args is None or (nest.is_sequence(args) and not args): raise ValueError("`args` must be specified") if not nest.is_sequence(args): args = [args] # Calculate the total size of arguments on dimension 1. total_arg_size = 0 shapes = [a.get_shape() for a in args] for shape in shapes: if shape.ndims != 2: raise ValueError("linear is expecting 2D arguments: %s" % shapes) if shape[1].value is None: raise ValueError("linear expects shape[1] to be provided for shape %s, " "but saw %s" % (shape, shape[1])) else: total_arg_size += shape[1].value dtype = [a.dtype for a in args][0] # Now the computation. scope = vs.get_variable_scope() with vs.variable_scope(scope) as outer_scope: weights = vs.get_variable( _WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size], dtype=dtype, initializer=kernel_initializer) if len(args) == 1: res = math_ops.matmul(args[0], weights) else: res = math_ops.matmul(array_ops.concat(args, 1), weights) if not bias: return res with vs.variable_scope(outer_scope) as inner_scope: inner_scope.set_partitioner(None) if bias_initializer is None: bias_initializer = init_ops.constant_initializer(0.0, dtype=dtype) biases = vs.get_variable( _BIAS_VARIABLE_NAME, [output_size], dtype=dtype, initializer=bias_initializer) return nn_ops.bias_add(res, biases)
def resetstate(self): if nest.is_sequence(self.initial_state): if nest.is_sequence(self.initial_state[0]): state = tuple(tuple(is2.eval() for is2 in ist) for ist in self.initial_state) else: state = tuple(ist.eval() for ist in self.initial_state) else: state = self.initial_state.eval() return state
def sum_logits(args, mask=None, name=None): with tf.name_scope(name or "sum_logits"): if args is None or (nest.is_sequence(args) and not args): raise ValueError("`args` must be specified") if not nest.is_sequence(args): args = [args] rank = len(args[0].get_shape()) logits = sum(tf.reduce_sum(arg, rank-1) for arg in args) if mask is not None: logits = exp_mask(logits, mask) return logits
def linear(args, output_size, bias, bias_start=0.0, scope=None): """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable. Args: args: a 2D Tensor or a list of 2D, batch x n, Tensors. output_size: int, second dimension of W[i]. bias: boolean, whether to add a bias term or not. bias_start: starting value to initialize the bias; 0 by default. scope: (optional) Variable scope to create parameters in. Returns: A 2D Tensor with shape [batch x output_size] equal to sum_i(args[i] * W[i]), where W[i]s are newly created matrices. Raises: ValueError: if some of the arguments has unspecified or wrong shape. """ if args is None or (nest.is_sequence(args) and not args): raise ValueError("`args` must be specified") if not nest.is_sequence(args): args = [args] # Calculate the total size of arguments on dimension 1. total_arg_size = 0 shapes = [a.get_shape() for a in args] for shape in shapes: if shape.ndims != 2: raise ValueError("linear is expecting 2D arguments: %s" % shapes) if shape[1].value is None: raise ValueError("linear expects shape[1] to be provided for shape %s, " "but saw %s" % (shape, shape[1])) else: total_arg_size += shape[1].value dtype = [a.dtype for a in args][0] # Now the computation. with tf.variable_scope(scope) as outer_scope: weights = tf.get_variable( "weights", [total_arg_size, output_size], dtype=dtype) if len(args) == 1: res = tf.matmul(args[0], weights) else: res = tf.matmul(tf.concat(args, 1), weights) if not bias: return res with tf.variable_scope(outer_scope) as inner_scope: inner_scope.set_partitioner(None) biases = tf.get_variable( "biases", [output_size], dtype=dtype, initializer=tf.constant_initializer(bias_start, dtype=dtype)) return tf.nn.bias_add(res, biases)
def _linear(args, output_size, bias, bias_start=0.0, weights_init=None, trainable=True, restore=True, reuse=False, scope=None): """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable. Arguments: args: a 2D Tensor or a list of 2D, batch x n, Tensors. output_size: int, second dimension of W[i]. bias: boolean, whether to add a bias term or not. bias_start: starting value to initialize the bias; 0 by default. scope: VariableScope for the created subgraph; defaults to "Linear". Returns: A 2D Tensor with shape [batch x output_size] equal to sum_i(args[i] * W[i]), where W[i]s are newly created matrices. Raises: ValueError: if some of the arguments has unspecified or wrong shape. """ if args is None or (is_sequence(args) and not args): raise ValueError("`args` must be specified") if not is_sequence(args): args = [args] # Calculate the total size of arguments on dimension 1. total_arg_size = 0 shapes = [a.get_shape().as_list() for a in args] for shape in shapes: if len(shape) != 2: raise ValueError( "Linear is expecting 2D arguments: %s" % str(shapes)) if not shape[1]: raise ValueError( "Linear expects shape[1] of arguments: %s" % str(shapes)) else: total_arg_size += shape[1] # Now the computation. with tf.variable_scope(scope or "Linear", reuse=reuse): matrix = va.variable("Matrix", [total_arg_size, output_size], initializer=weights_init, trainable=trainable, restore=restore) if len(args) == 1: res = tf.matmul(args[0], matrix) else: res = tf.matmul(array_ops.concat(1, args), matrix) if not bias: return res bias_term = va.variable( "Bias", [output_size], initializer=tf.constant_initializer(bias_start), trainable=trainable, restore=restore) return res + bias_term
def linear(args, output_size, bias, bias_start=0.0, scope=None, squeeze=False, keep_prob=None, is_train=None): if args is None or (nest.is_sequence(args) and not args): raise ValueError("args must be specified") if not nest.is_sequence(args): args = [args] flat_args = [flatten(arg, 1) for arg in args] if keep_prob is not None and is_train is not None: flat_args = [tf.cond(is_train, lambda: tf.nn.dropout(arg, keep_prob), lambda: arg) for arg in flat_args] with tf.variable_scope(scope or 'linear'): flat_out = _linear(flat_args, output_size, bias, bias_initializer=tf.constant_initializer(bias_start)) out = reconstruct(flat_out, args[0], 1) if squeeze: out = tf.squeeze(out, [len(args[0].get_shape().as_list())-1]) return out
def testIsSequence(self): self.assertFalse(nest.is_sequence("1234")) self.assertTrue(nest.is_sequence([1, 3, [4, 5]])) self.assertTrue(nest.is_sequence(((7, 8), (5, 6)))) self.assertTrue(nest.is_sequence([])) self.assertTrue(nest.is_sequence({"a": 1, "b": 2})) self.assertFalse(nest.is_sequence(set([1, 2]))) ones = array_ops.ones([2, 3]) self.assertFalse(nest.is_sequence(ones)) self.assertFalse(nest.is_sequence(math_ops.tanh(ones))) self.assertFalse(nest.is_sequence(np.ones((4, 5))))
def __call__(self, inputs, state, scope=None): """Run the cell with bottom layer's attention copied to all upper layers.""" if not nest.is_sequence(state): raise ValueError( "Expected state to be a tuple of length %d, but received: %s" % (len(self.state_size), state)) with tf.variable_scope(scope or "multi_rnn_cell"): new_states = [] with tf.variable_scope("cell_0_attention"): attention_cell = self._cells[0] attention_state = state[0] cur_inp, new_attention_state = attention_cell(inputs, attention_state) new_states.append(new_attention_state) for i in range(1, len(self._cells)): with tf.variable_scope("cell_%d" % i): cell = self._cells[i] cur_state = state[i] if self.use_new_attention: cur_inp = tf.concat([cur_inp, new_attention_state.attention], -1) else: cur_inp = tf.concat([cur_inp, attention_state.attention], -1) cur_inp, new_state = cell(cur_inp, cur_state) new_states.append(new_state) return cur_inp, tuple(new_states)
def wrapped_body(loop_counter, *args): """Loop body augmented with counter update. Args: loop_counter: Loop counter which needs to be incremented in the body. *args: List of args args[:len_orig_loop_vars] - Args for the original loop body. args[len_orig_loop_vars:] - External captures of cond. These get passed through as is. Returns: A list of tensors the same length as args. """ # Convert the flow variables in `args` to TensorArrays. `args` should # already have the same structure as `orig_loop_vars` but currently there # is no nest.zip so we call `_pack_sequence_as` which flattens both # `orig_loop_vars` and `args`, converts flows in `args` to TensorArrays # and packs it into the structure of `orig_loop_vars`. outputs = body( *_pack_sequence_as(orig_loop_vars, args[:len_orig_loop_vars])) if not nest.is_sequence(outputs): outputs = [outputs] # Compare the structure of input and output of body converting the # top-level tuples to list to be compatible with legacy while_loop. nest.assert_same_structure(list(outputs), list(orig_loop_vars)) outputs = _tensor_array_to_flow(outputs) # Return the external_captures of cond_graph as is, i.e., treat them as # loop invariants. # TODO(srbs): Update lowering code to create _Enter nodes with # is_constant=True for inputs that are directly passed to outputs. return [loop_counter + 1] + list(outputs) + list( args[len_orig_loop_vars:])
def map_structure_with_atomic(is_atomic_fn, map_fn, nested): """Maps the atomic elements of a nested structure. Arguments: is_atomic_fn: A function that determines if an element of `nested` is atomic. map_fn: The function to apply to atomic elements of `nested`. nested: A nested structure. Returns: The nested structure, with atomic elements mapped according to `map_fn`. Raises: ValueError: If an element that is neither atomic nor a sequence is encountered. """ if is_atomic_fn(nested): return map_fn(nested) # Recursively convert. if not nest.is_sequence(nested): raise ValueError( 'Received non-atomic and non-sequence element: {}'.format(nested)) if nest._is_mapping(nested): values = [nested[k] for k in nest._sorted(nested)] else: values = nested mapped_values = [ map_structure_with_atomic(is_atomic_fn, map_fn, ele) for ele in values ] return nest._sequence_like(nested, mapped_values)
def __call__(self, inputs, state, scope=None): """Run this multi-layer cell on inputs, starting from state.""" with vs.variable_scope(scope or type(self).__name__): # "MultiRNNCell" cur_state_pos = 0 cur_inp = inputs new_states = [] for i, cell in enumerate(self._cells): with vs.variable_scope("Cell%d" % i): if self._state_is_tuple: if not nest.is_sequence(state): raise ValueError( "Expected state to be a tuple of length %d, but received: %s" % (len(self.state_size), state)) cur_state = state[i] else: # print("STATE",state) """ cur_state = array_ops.slice( state, [0, cur_state_pos], [-1, cell.state_size]) """ cur_state = array_ops.unpack(state)[i] # cur_state_pos += cell.state_size cur_inp, new_state = cell(cur_inp, cur_state) new_states.append(new_state) """ new_states = (tuple(new_states) if self._state_is_tuple else array_ops.concat(1, new_states)) """ new_states = array_ops.pack(new_states) return cur_inp, new_states
def __call__(self, inputs, state, scope=None): """Run this multi-layer cell on inputs, starting from state.""" with vs.variable_scope(scope or "multi_rnn_cell"): cur_state_pos = 0 cur_inp = inputs new_states = [] outputs = [] for i, cell in enumerate(self._cells): with vs.variable_scope("cell_%d" % i): if self._state_is_tuple: if not nest.is_sequence(state): raise ValueError( "Expected state to be a tuple of length %d, but received: %s" % (len(self.state_size), state)) cur_state = state[i] else: cur_state = array_ops.slice( state, [0, cur_state_pos], [-1, cell.state_size]) cur_state_pos += cell.state_size cur_inp, new_state = cell(cur_inp, cur_state) outputs.append(cur_inp) new_states.append(new_state) new_states = (tuple(new_states) if self._state_is_tuple else array_ops.concat_v2(new_states, 1)) return tuple(outputs), new_states
def _create(self, encoder_output, decoder_state_size, **kwargs): """ Creates decoder's initial RNN states according to `decoder_state_size`. Passes the final state of encoder to each layer in decoder. Args: encoder_output: An instance of `collections.namedtuple` from `Encoder.encode()`. decoder_state_size: RNN decoder state size. **kwargs: Returns: The decoder states with the structure determined by `decoder_state_size`. Raises: ValueError: if the structure of encoder RNN state does not have the same structure of decoder RNN state. """ batch_size = tf.shape(encoder_output.attention_length)[0] # of type LSTMStateTuple enc_final_state = _final_state( encoder_output.final_states, direction=self.params["direction"]) assert_state_is_compatible(rnn_cell_impl._zero_state_tensors( decoder_state_size[0], batch_size, tf.float32), enc_final_state) if nest.is_sequence(decoder_state_size): return tuple([enc_final_state for _ in decoder_state_size]) return enc_final_state
def _infer_state_dtype(explicit_dtype, state): """Infer the dtype of an RNN state. Args: explicit_dtype: explicitly declared dtype or None. state: RNN's hidden state. Must be a Tensor or a nested iterable containing Tensors. Returns: dtype: inferred dtype of hidden state. Raises: ValueError: if `state` has heterogeneous dtypes or is empty. """ if explicit_dtype is not None: return explicit_dtype elif nest.is_sequence(state): inferred_dtypes = [element.dtype for element in nest.flatten(state)] if not inferred_dtypes: raise ValueError("Unable to infer dtype from empty state.") all_same = all([x == inferred_dtypes[0] for x in inferred_dtypes]) if not all_same: raise ValueError( "State has tensors of different inferred_dtypes. Unable to infer a " "single representative dtype.") return inferred_dtypes[0] else: return state.dtype
def zero_state(self, batch_size, dtype): """Return zero-filled state tensor(s). Args: batch_size: int, float, or unit Tensor representing the batch size. dtype: the data type to use for the state. Returns: If `state_size` is an int or TensorShape, then the return value is a `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. If `state_size` is a nested list or tuple, then the return value is a nested list or tuple (of the same structure) of `2-D` tensors with the shapes `[batch_size x s]` for each s in `state_size`. """ state_size = self.state_size if nest.is_sequence(state_size): state_size_flat = nest.flatten(state_size) zeros_flat = [ array_ops.zeros( array_ops.pack(_state_size_with_prefix(s, prefix=[batch_size])), dtype=dtype) for s in state_size_flat] for s, z in zip(state_size_flat, zeros_flat): z.set_shape(_state_size_with_prefix(s, prefix=[None])) zeros = nest.pack_sequence_as(structure=state_size, flat_sequence=zeros_flat) else: zeros_size = _state_size_with_prefix(state_size, prefix=[batch_size]) zeros = array_ops.zeros(array_ops.pack(zeros_size), dtype=dtype) zeros.set_shape(_state_size_with_prefix(state_size, prefix=[None])) return zeros
def _tile_along_beam(cls, beam_size, state): if nest.is_sequence(state): return nest_map( lambda val: cls._tile_along_beam(beam_size, val), state ) if not isinstance(state, tf.Tensor): raise ValueError("State should be a sequence or tensor") tensor = state tensor_shape = tensor.get_shape().with_rank_at_least(1) try: new_first_dim = tensor_shape[0] * beam_size except: new_first_dim = None dynamic_tensor_shape = tf.unpack(tf.shape(tensor)) res = tf.expand_dims(tensor, 1) res = tf.tile(res, [1, beam_size] + [1] * (tensor_shape.ndims-1)) res = tf.reshape(res, [-1] + list(dynamic_tensor_shape[1:])) res.set_shape([new_first_dim] + list(tensor_shape[1:])) return res
def attention(query): """Put attention masks on hidden using hidden_features and query.""" ds = [] # Results of attention reads will be stored here. if nest.is_sequence(query): # If the query is a tuple, flatten it. query_list = nest.flatten(query) for q in query_list: # Check that ndims == 2 if specified. ndims = q.get_shape().ndims if ndims: assert ndims == 2 query = array_ops.concat(1, query_list) for i in xrange(num_heads): with variable_scope.variable_scope("Attention_%d" % i): y = linear(query, attention_vec_size, True) y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) # Attention mask is a softmax of v^T * tanh(...). s = math_ops.reduce_sum( v[i] * math_ops.tanh(hidden_features[i] + y), [2, 3]) # multiply with source mask, then do softmax if src_mask is not None: s = s * src_mask a = nn_ops.softmax(s) # Now calculate the attention-weighted vector d. d = math_ops.reduce_sum( array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) ds.append(array_ops.reshape(d, [-1, attn_size])) return ds
def wrapped_body(loop_counter, *args): """Loop body augmented with counter update. Args: loop_counter: Loop counter which needs to be incremented in the body. *args: List of args Returns: A list of tensors the same length as args. """ # Capture the tensors already captured in cond_graph so that they appear # in the same order in body_graph.external_captures. for t in cond_graph.external_captures: ops.get_default_graph().capture(t) # Convert the flow variables in `args` to TensorArrays. `args` should # already have the same structure as `orig_loop_vars` but currently there # is no nest.zip so we call `_pack_sequence_as` which flattens both # `orig_loop_vars` and `args`, converts flows in `args` to TensorArrays # and packs it into the structure of `orig_loop_vars`. outputs = body(*_pack_sequence_as(orig_loop_vars, args)) if not nest.is_sequence(outputs): outputs = [outputs] # Compare the structure of input and output of body converting the # top-level tuples to list to be compatible with legacy while_loop. nest.assert_same_structure(list(outputs), list(orig_loop_vars)) outputs = _tensor_array_to_flow(outputs) # TODO(srbs): Update lowering code to create _Enter nodes with # is_constant=True for inputs that are directly passed to outputs. return [loop_counter + 1] + list(outputs)
def attention(self, state): """Put attention masks on hidden using hidden_features and query.""" ds = [] # Results of attention reads will be stored here. if nest.is_sequence(state): # If the query is a tuple, flatten it. # query_list = nest.flatten(state) # for q in query_list: # Check that ndims == 2 if specified. # ndims = q.get_shape().ndims # if ndims: # assert ndims == 2 # state = tf.concat(1, query_list) state = state[1] for a in xrange(self.num_heads): with tf.variable_scope("Attention_%d" % a, reuse=self.reuse_variables): y = tf.reshape(state, [-1, 1, 1, self.attn_vec_dim]) # Attention mask is a softmax of v^T * tanh(...). # s = tf.reduce_sum( # v[a] * tf.tanh(hidden_features[a] + y), [2, 3]) # s = tf.reduce_sum( # self.v[a] * tf.mul(self.hidden_features[a], y), [2, 3]) s = tf.reduce_sum(tf.mul(self.hidden_features[a], y), [2, 3]) s = s - (1 - self.encoder_attn_masks) * 1e12 attn_mask = tf.nn.softmax(s) # Now calculate the attention-weighted vector d. d = tf.reduce_sum(tf.reshape(attn_mask, [-1, self.attn_length, 1, 1]) * self.hidden_features[a], [1, 2]) ds.append(tf.reshape(d, [-1, self.attn_dim])) attns = tf.concat(1, ds) attns.set_shape([None, self.num_heads * self.attn_dim]) self.attention_vars = True return attns, attn_mask
def match(self, expected, actual): """Matches nested structures. Recursively matches shape and values of `expected` and `actual`. Handles scalars, numpy arrays and other python sequence containers e.g. list, dict. Args: expected: Nested structure 1. actual: Nested structure 2. Raises: AssertionError if matching fails. """ if isinstance(expected, np.ndarray): expected = expected.tolist() if isinstance(actual, np.ndarray): actual = actual.tolist() self.assertEqual(type(expected), type(actual)) if nest.is_sequence(expected): self.assertEqual(len(expected), len(actual)) if isinstance(expected, dict): for key1, key2 in zip(sorted(expected), sorted(actual)): self.assertEqual(key1, key2) self.match(expected[key1], actual[key2]) else: for item1, item2 in zip(expected, actual): self.match(item1, item2) else: self.assertEqual(expected, actual)
def testIsSequence(self): self.assertFalse(nest.is_sequence("1234")) self.assertTrue(nest.is_sequence([1, 3, [4, 5]])) self.assertTrue(nest.is_sequence(((7, 8), (5, 6)))) self.assertTrue(nest.is_sequence([])) self.assertFalse(nest.is_sequence(set([1, 2]))) ones = tf.ones([2, 3]) self.assertFalse(nest.is_sequence(ones)) self.assertFalse(nest.is_sequence(tf.tanh(ones))) self.assertFalse(nest.is_sequence(np.ones((4, 5))))
def __init__(self, args, output_size, build_bias, bias_initializer=None, kernel_initializer=None): self._build_bias = build_bias if args is None or (nest.is_sequence(args) and not args): raise ValueError("`args` must be specified") if not nest.is_sequence(args): args = [args] self._is_sequence = False else: self._is_sequence = True # Calculate the total size of arguments on dimension 1. total_arg_size = 0 shapes = [a.get_shape() for a in args] for shape in shapes: if shape.ndims != 2: raise ValueError("linear is expecting 2D arguments: %s" % shapes) if shape[1].value is None: raise ValueError("linear expects shape[1] to be provided for shape %s, " "but saw %s" % (shape, shape[1])) else: total_arg_size += shape[1].value dtype = [a.dtype for a in args][0] scope = vs.get_variable_scope() with vs.variable_scope(scope) as outer_scope: self._weights = vs.get_variable( _WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size], dtype=dtype, initializer=kernel_initializer) if build_bias: with vs.variable_scope(outer_scope) as inner_scope: inner_scope.set_partitioner(None) if bias_initializer is None: bias_initializer = init_ops.constant_initializer(0.0, dtype=dtype) self._biases = vs.get_variable( _BIAS_VARIABLE_NAME, [output_size], dtype=dtype, initializer=bias_initializer)
def _check_default_value(shape, default_value, dtype, key): """Returns default value as tuple if it's valid, otherwise raises errors. This function verifies that `default_value` is compatible with both `shape` and `dtype`. If it is not compatible, it raises an error. If it is compatible, it casts default_value to a tuple and returns it. `key` is used only for error message. Args: shape: An iterable of integers specifies the shape of the `Tensor`. default_value: If a single value is provided, the same value will be applied as the default value for every item. If an iterable of values is provided, the shape of the `default_value` should be equal to the given `shape`. dtype: defines the type of values. Default value is `tf.float32`. Must be a non-quantized, real integer or floating point type. key: A string providing key to look up corresponding `Tensor`. Returns: A tuple which will be used as default value. Raises: TypeError: if `default_value` is an iterable but not compatible with `shape` TypeError: if `default_value` is not compatible with `dtype`. ValueError: if `dtype` is not convertible to `tf.float32`. """ if default_value is None: return None if isinstance(default_value, int): return _create_tuple(shape, default_value) if isinstance(default_value, float) and dtype.is_floating: return _create_tuple(shape, default_value) if callable(getattr(default_value, 'tolist', None)): # Handles numpy arrays default_value = default_value.tolist() if nest.is_sequence(default_value): if not _is_shape_and_default_value_compatible(default_value, shape): raise ValueError( 'The shape of default_value must be equal to given shape. ' 'default_value: {}, shape: {}, key: {}'.format( default_value, shape, key)) # Check if the values in the list are all integers or are convertible to # floats. is_list_all_int = all( isinstance(v, int) for v in nest.flatten(default_value)) is_list_has_float = any( isinstance(v, float) for v in nest.flatten(default_value)) if is_list_all_int: return _as_tuple(default_value) if is_list_has_float and dtype.is_floating: return _as_tuple(default_value) raise TypeError('default_value must be compatible with dtype. ' 'default_value: {}, dtype: {}, key: {}'.format( default_value, dtype, key))
def wrap_state(self, state): dummy = BeamDecoderCellWrapper(None, self.num_classes, self.max_len, self.stop_token, self.beam_size) if nest.is_sequence(state): batch_size = tf.shape(nest.flatten(state)[0])[0] dtype = nest.flatten(state)[0].dtype else: batch_size = tf.shape(state)[0] dtype = state.dtype return dummy._create_state(batch_size, dtype, cell_state=state)
def _fwlinear(self, args, output_size, scope=None): if args is None or (nest.is_sequence(args) and not args): raise ValueError("`args` must be specified") if not nest.is_sequence(args): args = [args] assert len(args) == 2 assert args[0].get_shape().as_list()[1] == output_size dtype = [a.dtype for a in args][0] with vs.variable_scope(scope or "Linear"): # matrixW = vs.get_variable( # "MatrixW", dtype=dtype, initializer=tf.convert_to_tensor(np.eye(output_size, dtype=np.float32) * .05)) matrixW = vs.get_variable("MatrixW", [output_size, output_size], dtype=dtype) matrixC = vs.get_variable( "MatrixC", [args[1].get_shape().as_list()[1], output_size], dtype=dtype) res = tf.matmul(args[0], matrixW) + tf.matmul(args[1], matrixC) return res
def __init__(self, cells, state_is_tuple=True): if not cells: raise ValueError("Must specify at least one cell for MultiRNNCell.") self._cells = cells self._state_is_tuple = state_is_tuple if not state_is_tuple: if any(nest.is_sequence(c.state_size) for c in self._cells): raise ValueError("Some cells return tuples of states, but the flag " "state_is_tuple is not set. State sizes are: %s" % str([c.state_size for c in self._cells]))
def linear(args, output_size, bias, bias_start=0.0, scope=None, squeeze=False, wd=0.0, input_keep_prob=1.0, is_train=None): if args is None or (nest.is_sequence(args) and not args): raise ValueError("`args` must be specified") if not nest.is_sequence(args): args = [args] flat_args = [flatten(arg, 1) for arg in args] if input_keep_prob < 1.0: assert is_train is not None flat_args = [tf.cond(is_train, lambda: tf.nn.dropout(arg, input_keep_prob), lambda: arg) for arg in flat_args] flat_out = _linear(flat_args, output_size, bias, bias_start=bias_start, scope=scope) out = reconstruct(flat_out, args[0], 1) if squeeze: out = tf.squeeze(out, [len(args[0].get_shape().as_list())-1]) if wd: add_wd(wd) return out
def wrap_state(self, state, output_projection): dummy = BeamDecoderCellWrapper(None, output_projection, self.num_classes, self.max_len, self.start_token, self.stop_token, self.batch_size, self.beam_size, self.use_attention, self.alpha) if nest.is_sequence(state): dtype = nest.flatten(state)[0].dtype else: dtype = state.dtype return dummy._create_state(self.batch_size, dtype, cell_state=state)
def embedding_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, num_encoder_symbols, num_decoder_symbols, embedding_size, output_projection=None, feed_previous=False, dtype=None, scope=None): with variable_scope.variable_scope(scope or "embedding_rnn_seq2seq") as scope: if dtype is not None: scope.set_dtype(dtype) else: dtype = scope.dtype # Encoder. encoder_cell = rnn_cell.EmbeddingWrapper( cell, embedding_classes=num_encoder_symbols, embedding_size=embedding_size) _, encoder_state = rnn.rnn(encoder_cell, encoder_inputs, dtype=dtype) # Decoder. if output_projection is None: cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols) if isinstance(feed_previous, bool): return embedding_rnn_decoder( decoder_inputs, encoder_state, cell, num_decoder_symbols, embedding_size, output_projection=output_projection, feed_previous=feed_previous, scope=scope) # If feed_previous is a Tensor, we construct 2 graphs and use cond. def decoder(feed_previous_bool): reuse = None if feed_previous_bool else True with variable_scope.variable_scope( variable_scope.get_variable_scope(), reuse=reuse) as scope: outputs, state = embedding_rnn_decoder( decoder_inputs, encoder_state, cell, num_decoder_symbols, embedding_size, output_projection=output_projection, feed_previous=feed_previous_bool, update_embedding_for_previous=False) state_list = [state] if nest.is_sequence(state): state_list = nest.flatten(state) return outputs + state_list outputs_and_state = control_flow_ops.cond(feed_previous, lambda: decoder(True), lambda: decoder(False)) outputs_len = len(decoder_inputs) # Outputs length same as decoder inputs. state_list = outputs_and_state[outputs_len:] state = state_list[0] if nest.is_sequence(encoder_state): state = nest.pack_sequence_as(structure=encoder_state, flat_sequence=state_list) return outputs_and_state[:outputs_len], state
def prune_extra_keys(narrow, wide): """Recursively prunes keys from `wide` if they don't appear in `narrow`. Often used as preprocessing prior to calling `tf.nest.flatten` or `tf.nest.map_structure`. This function is more forgiving than the ones in `nest`; if two substructures' types or structures don't agree, we consider it invalid and `prune_extra_keys` will return the `wide` substructure as is. Typically, additional checking is needed: you will also want to use `nest.assert_same_structure(narrow, prune_extra_keys(narrow, wide))` to ensure the result of pruning is still a correct structure. Examples: ```python wide = [{"a": "a", "b": "b"}] # Narrows 'wide' assert prune_extra_keys([{"a": 1}], wide) == [{"a": "a"}] # 'wide' lacks "c", is considered invalid. assert prune_extra_keys([{"c": 1}], wide) == wide # 'wide' contains a different type from 'narrow', is considered invalid assert prune_extra_keys("scalar", wide) == wide # 'wide' substructure for key "d" does not match the one in 'narrow' and # therefore is returned unmodified. assert (prune_extra_keys({"a": {"b": 1}, "d": None}, {"a": {"b": "b", "c": "c"}, "d": [1, 2]}) == {"a": {"b": "b"}, "d": [1, 2]}) ``` Args: narrow: A nested structure. wide: A nested structure that may contain dicts with more fields than `narrow`. Returns: A structure with the same nested substructures as `wide`, but with dicts whose entries are limited to the keys found in the associated substructures of `narrow`. In case of substructure or size mismatches, the returned substructures will be returned as is. Note that ObjectProxy-wrapped objects are considered equivalent to their non-ObjectProxy types. """ if isinstance(wide, wrapt.ObjectProxy): return type(wide)(prune_extra_keys(narrow, wide.__wrapped__)) narrow_raw = (narrow.__wrapped__ if isinstance(narrow, wrapt.ObjectProxy) else narrow) wide_raw = (wide.__wrapped__ if isinstance(wide, wrapt.ObjectProxy) else wide) if ((type(narrow_raw) != type(wide_raw)) # pylint: disable=unidiomatic-typecheck and not (isinstance(narrow_raw, list) and isinstance(wide_raw, list)) and not (isinstance(narrow_raw, collections_abc.Mapping) and isinstance(wide_raw, collections_abc.Mapping))): # We return early if the types are different; but we make some exceptions: # list subtypes are considered the same (e.g. ListWrapper and list()) # Mapping subtypes are considered the same (e.g. DictWrapper and dict()) # (TupleWrapper subtypes are handled by unwrapping ObjectProxy above) return wide if isinstance(narrow, collections_abc.Mapping): if len(narrow) > len(wide): # wide lacks a required key from narrow; return early. return wide narrow_keys = set(narrow.keys()) wide_keys = set(wide.keys()) if not wide_keys.issuperset(narrow_keys): # wide lacks a required key from narrow; return early. return wide ordered_items = [(k, prune_extra_keys(v, wide[k])) for k, v in narrow.items()] if isinstance(wide, collections.defaultdict): subset = type(wide)(wide.default_factory, ordered_items) else: subset = type(wide)(ordered_items) return subset if nest.is_sequence(narrow): if _is_attrs(wide): items = [ prune_extra_keys(n, w) for n, w in zip(_attr_items(narrow), _attr_items(wide)) ] return type(wide)(*items) # Not an attrs, so can treat as lists or tuples from here on. if len(narrow) != len(wide): # wide's size is different than narrow; return early. return wide items = [prune_extra_keys(n, w) for n, w in zip(narrow, wide)] if _is_namedtuple(wide): return type(wide)(*items) elif _is_attrs(wide): return type(wide) return type(wide)(items) # narrow is a leaf, just return wide return wide
def embedding_tied_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, num_symbols, embedding_size, num_decoder_symbols=None, output_projection=None, feed_previous=False, dtype=None, scope=None): """Embedding RNN sequence-to-sequence model with tied (shared) parameters. This model first embeds encoder_inputs by a newly created embedding (of shape [num_symbols x input_size]). Then it runs an RNN to encode embedded encoder_inputs into a state vector. Next, it embeds decoder_inputs using the same embedding. Then it runs RNN decoder, initialized with the last encoder state, on embedded decoder_inputs. The decoder output is over symbols from 0 to num_decoder_symbols - 1 if num_decoder_symbols is none; otherwise it is over 0 to num_symbols - 1. Args: encoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. decoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. cell: rnn_cell.RNNCell defining the cell function and size. num_symbols: Integer; number of symbols for both encoder and decoder. embedding_size: Integer, the length of the embedding vector for each symbol. num_decoder_symbols: Integer; number of output symbols for decoder. If provided, the decoder output is over symbols 0 to num_decoder_symbols - 1. Otherwise, decoder output is over symbols 0 to num_symbols - 1. Note that this assumes that the vocabulary is set up such that the first num_decoder_symbols of num_symbols are part of decoding. output_projection: None or a pair (W, B) of output projection weights and biases; W has shape [output_size x num_symbols] and B has shape [num_symbols]; if provided and feed_previous=True, each fed previous output will first be multiplied by W and added B. feed_previous: Boolean or scalar Boolean Tensor; if True, only the first of decoder_inputs will be used (the "GO" symbol), and all other decoder inputs will be taken from previous outputs (as in embedding_rnn_decoder). If False, decoder_inputs are used as given (the standard decoder case). dtype: The dtype to use for the initial RNN states (default: tf.float32). scope: VariableScope for the created subgraph; defaults to "embedding_tied_rnn_seq2seq". Returns: A tuple of the form (outputs, state), where: outputs: A list of the same length as decoder_inputs of 2D Tensors with shape [batch_size x output_symbols] containing the generated outputs where output_symbols = num_decoder_symbols if num_decoder_symbols is not None otherwise output_symbols = num_symbols. state: The state of each decoder cell at the final time-step. It is a 2D Tensor of shape [batch_size x cell.state_size]. Raises: ValueError: When output_projection has the wrong shape. """ with variable_scope.variable_scope( scope or "embedding_tied_rnn_seq2seq", dtype=dtype) as scope: dtype = scope.dtype if output_projection is not None: proj_weights = ops.convert_to_tensor(output_projection[0], dtype=dtype) proj_weights.get_shape().assert_is_compatible_with([None, num_symbols]) proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype) proj_biases.get_shape().assert_is_compatible_with([num_symbols]) embedding = variable_scope.get_variable( "embedding", [num_symbols, embedding_size], dtype=dtype) emb_encoder_inputs = [embedding_ops.embedding_lookup(embedding, x) for x in encoder_inputs] emb_decoder_inputs = [embedding_ops.embedding_lookup(embedding, x) for x in decoder_inputs] output_symbols = num_symbols if num_decoder_symbols is not None: output_symbols = num_decoder_symbols if output_projection is None: cell = rnn_cell.OutputProjectionWrapper(cell, output_symbols) if isinstance(feed_previous, bool): loop_function = _extract_argmax_and_embed( embedding, output_projection, True) if feed_previous else None return tied_rnn_seq2seq(emb_encoder_inputs, emb_decoder_inputs, cell, loop_function=loop_function, dtype=dtype) # If feed_previous is a Tensor, we construct 2 graphs and use cond. def decoder(feed_previous_bool): loop_function = _extract_argmax_and_embed( embedding, output_projection, False) if feed_previous_bool else None reuse = None if feed_previous_bool else True with variable_scope.variable_scope(variable_scope.get_variable_scope(), reuse=reuse): outputs, state = tied_rnn_seq2seq( emb_encoder_inputs, emb_decoder_inputs, cell, loop_function=loop_function, dtype=dtype) state_list = [state] if nest.is_sequence(state): state_list = nest.flatten(state) return outputs + state_list outputs_and_state = control_flow_ops.cond(feed_previous, lambda: decoder(True), lambda: decoder(False)) outputs_len = len(decoder_inputs) # Outputs length same as decoder inputs. state_list = outputs_and_state[outputs_len:] state = state_list[0] # Calculate zero-state to know it's structure. static_batch_size = encoder_inputs[0].get_shape()[0] for inp in encoder_inputs[1:]: static_batch_size.merge_with(inp.get_shape()[0]) batch_size = static_batch_size.value if batch_size is None: batch_size = array_ops.shape(encoder_inputs[0])[0] zero_state = cell.zero_state(batch_size, dtype) if nest.is_sequence(zero_state): state = nest.pack_sequence_as(structure=zero_state, flat_sequence=state_list) return outputs_and_state[:outputs_len], state
def nest_map(func, nested): if not nest.is_sequence(nested): return func(nested) flat = nest.flatten(nested) return nest.pack_sequence_as(nested, list(map(func, flat)))
def attention(query): """ Put attention masks on hidden using hidden_features and query. :param query: Vector to compute attention with """ # Results of attention reads will be stored here. ds = [] # Will store masks over encoder context attn_masks = [] # Store attention logits attn_logits = [] # If the query is a tuple, flatten it. if nest.is_sequence(query): query_list = nest.flatten(query) # Check that ndims == 2 if specified. for q in query_list: ndims = q.get_shape().ndims if ndims: assert ndims == 2 query = array_ops.concat(1, query_list) for a in xrange(num_heads): with variable_scope.variable_scope("Attention_%d" % a): if attn_type == "linear": y = linear(query, attention_vec_size, True) y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) # Attention mask is a softmax of v^T * tanh(...). s = math_ops.reduce_sum( v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3]) elif attn_type == "bilinear": query = tf.tile(tf.expand_dims(query, 1), [1, attn_length, 1]) query = batch_linear(query, attn_size, bias=True) hid = tf.squeeze(hidden, [2]) s = tf.reduce_sum(tf.mul(query, hid), [2]) else: # Two layer MLP y = linear(query, attention_vec_size, True) y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) # Attention mask is a softmax of v^T * tanh(...). layer1 = math_ops.tanh(hidden_features[a] + y) k2 = variable_scope.get_variable( "AttnW_%d" % a, [1, 1, attn_size, attention_vec_size]) layer2 = nn_ops.conv2d(layer1, k2, [1, 1, 1, 1], "SAME") s = math_ops.reduce_sum(v[a] * math_ops.tanh(layer2), [2, 3]) a = nn_ops.softmax(s) attn_masks.append(a) attn_logits.append(s) # Now calculate the attention-weighted vector d. Hidden is encoder # hidden states d = math_ops.reduce_sum( array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) ds.append(array_ops.reshape(d, [-1, attn_size])) return ds, attn_masks, attn_logits
def __init__(self, input_tensor_spec, preprocessing_layers=None, preprocessing_combiner=None, conv_layer_params=None, fc_layer_params=None, dropout_layer_params=None, activation_fn=tf.keras.activations.relu, weight_decay_params=None, kernel_initializer=None, batch_squash=True, dtype=tf.float32, name='EncodingNetwork', conv_type=CONV_TYPE_2D): """Creates an instance of `EncodingNetwork`. Network supports calls with shape outer_rank + input_tensor_spec.shape. Note outer_rank must be at least 1. For example an input tensor spec with shape `(2, 3)` will require inputs with at least a batch size, the input shape is `(?, 2, 3)`. Input preprocessing is possible via `preprocessing_layers` and `preprocessing_combiner` Layers. If the `preprocessing_layers` nest is shallower than `input_tensor_spec`, then the layers will get the subnests. For example, if: ```python input_tensor_spec = ([TensorSpec(3)] * 2, [TensorSpec(3)] * 5) preprocessing_layers = (Layer1(), Layer2()) ``` then preprocessing will call: ```python preprocessed = [preprocessing_layers[0](observations[0]), preprocessing_layers[1](obsrevations[1])] ``` However if ```python preprocessing_layers = ([Layer1() for _ in range(2)], [Layer2() for _ in range(5)]) ``` then preprocessing will call: ```python preprocessed = [ layer(obs) for layer, obs in zip(flatten(preprocessing_layers), flatten(observations)) ] ``` **NOTE** `preprocessing_layers` and `preprocessing_combiner` are not allowed to have already been built. This ensures calls to `network.copy()` in the future always have an unbuilt, fresh set of parameters. Furtheremore, a shallow copy of the layers is always created by the Network, so the layer objects passed to the network are never modified. For more details of the semantics of `copy`, see the docstring of `tf_agents.networks.Network.copy`. Args: input_tensor_spec: A nest of `tensor_spec.TensorSpec` representing the input observations. preprocessing_layers: (Optional.) A nest of `tf.keras.layers.Layer` representing preprocessing for the different observations. All of these layers must not be already built. preprocessing_combiner: (Optional.) A keras layer that takes a flat list of tensors and combines them. Good options include `tf.keras.layers.Add` and `tf.keras.layers.Concatenate(axis=-1)`. This layer must not be already built. conv_layer_params: Optional list of convolution layers parameters, where each item is either a length-three tuple indicating `(filters, kernel_size, stride)` or a length-four tuple indicating `(filters, kernel_size, stride, dilation_rate)`. fc_layer_params: Optional list of fully_connected parameters, where each item is the number of units in the layer. dropout_layer_params: Optional list of dropout layer parameters, each item is the fraction of input units to drop or a dictionary of parameters according to the keras.Dropout documentation. The additional parameter `permanent', if set to True, allows to apply dropout at inference for approximated Bayesian inference. The dropout layers are interleaved with the fully connected layers; there is a dropout layer after each fully connected layer, except if the entry in the list is None. This list must have the same length of fc_layer_params, or be None. activation_fn: Activation function, e.g. tf.keras.activations.relu. weight_decay_params: Optional list of weight decay parameters for the fully connected layers. kernel_initializer: Initializer to use for the kernels of the conv and dense layers. If none is provided a default variance_scaling_initializer batch_squash: If True the outer_ranks of the observation are squashed into the batch dimension. This allow encoding networks to be used with observations with shape [BxTx...]. dtype: The dtype to use by the convolution and fully connected layers. name: A string representing name of the network. conv_type: string, '1d' or '2d'. Convolution layers will be 1d or 2D respectively Raises: ValueError: If any of `preprocessing_layers` is already built. ValueError: If `preprocessing_combiner` is already built. ValueError: If the number of dropout layer parameters does not match the number of fully connected layer parameters. ValueError: If conv_layer_params tuples do not have 3 or 4 elements each. """ if preprocessing_layers is None: flat_preprocessing_layers = None else: flat_preprocessing_layers = [ _copy_layer(layer) for layer in tf.nest.flatten(preprocessing_layers) ] # Assert shallow structure is the same. This verifies preprocessing # layers can be applied on expected input nests. input_nest = input_tensor_spec # Given the flatten on preprocessing_layers above we need to make sure # input_tensor_spec is a sequence for the shallow_structure check below # to work. if not nest.is_sequence(input_tensor_spec): input_nest = [input_tensor_spec] nest.assert_shallow_structure(preprocessing_layers, input_nest, check_types=False) if (len(tf.nest.flatten(input_tensor_spec)) > 1 and preprocessing_combiner is None): raise ValueError( 'preprocessing_combiner layer is required when more than 1 ' 'input_tensor_spec is provided.') if preprocessing_combiner is not None: preprocessing_combiner = _copy_layer(preprocessing_combiner) if not kernel_initializer: kernel_initializer = tf.compat.v1.variance_scaling_initializer( scale=2.0, mode='fan_in', distribution='truncated_normal') layers = [] if conv_layer_params: if conv_type == '2d': conv_layer_type = tf.keras.layers.Conv2D elif conv_type == '1d': conv_layer_type = tf.keras.layers.Conv1D else: raise ValueError('unsupported conv type of %s. Use 1d or 2d' % (conv_type)) for config in conv_layer_params: if len(config) == 4: (filters, kernel_size, strides, dilation_rate) = config elif len(config) == 3: (filters, kernel_size, strides) = config dilation_rate = (1, 1) if conv_type == '2d' else (1, ) else: raise ValueError( 'only 3 or 4 elements permitted in conv_layer_params tuples' ) layers.append( conv_layer_type(filters=filters, kernel_size=kernel_size, strides=strides, dilation_rate=dilation_rate, activation=activation_fn, kernel_initializer=kernel_initializer, dtype=dtype)) layers.append(tf.keras.layers.Flatten()) if fc_layer_params: if dropout_layer_params is None: dropout_layer_params = [None] * len(fc_layer_params) else: if len(dropout_layer_params) != len(fc_layer_params): raise ValueError( 'Dropout and fully connected layer parameter lists' 'have different lengths (%d vs. %d.)' % (len(dropout_layer_params), len(fc_layer_params))) if weight_decay_params is None: weight_decay_params = [None] * len(fc_layer_params) else: if len(weight_decay_params) != len(fc_layer_params): raise ValueError( 'Weight decay and fully connected layer parameter ' 'lists have different lengths (%d vs. %d.)' % (len(weight_decay_params), len(fc_layer_params))) for num_units, dropout_params, weight_decay in zip( fc_layer_params, dropout_layer_params, weight_decay_params): kernal_regularizer = None if weight_decay is not None: kernal_regularizer = tf.keras.regularizers.l2(weight_decay) layers.append( tf.keras.layers.Dense( num_units, activation=activation_fn, kernel_initializer=kernel_initializer, kernel_regularizer=kernal_regularizer, dtype=dtype)) if not isinstance(dropout_params, dict): dropout_params = { 'rate': dropout_params } if dropout_params else None if dropout_params is not None: layers.append( utils.maybe_permanent_dropout(**dropout_params)) super(EncodingNetwork, self).__init__(input_tensor_spec=input_tensor_spec, state_spec=(), name=name) # Pull out the nest structure of the preprocessing layers. This avoids # saving the original kwarg layers as a class attribute which Keras would # then track. self._preprocessing_nest = tf.nest.map_structure( lambda l: None, preprocessing_layers) self._flat_preprocessing_layers = flat_preprocessing_layers self._preprocessing_combiner = preprocessing_combiner self._postprocessing_layers = layers self._batch_squash = batch_squash
def _linear(args, output_size, bias, bias_initializer=None, kernel_initializer=None): """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable. Args: args: a 2D Tensor or a list of 2D, batch x n, Tensors. output_size: int, second dimension of W[i]. bias: boolean, whether to add a bias term or not. bias_initializer: starting value to initialize the bias (default is all zeros). kernel_initializer: starting value to initialize the weight. Returns: A 2D Tensor with shape [batch x output_size] equal to sum_i(args[i] * W[i]), where W[i]s are newly created matrices. Raises: ValueError: if some of the arguments has unspecified or wrong shape. """ if args is None or (nest.is_sequence(args) and not args): raise ValueError("`args` must be specified") if not nest.is_sequence(args): args = [args] # Calculate the total size of arguments on dimension 1. total_arg_size = 0 shapes = [a.get_shape() for a in args] for shape in shapes: if shape.ndims != 2: raise ValueError("linear is expecting 2D arguments: %s" % shapes) if shape[1].value is None: raise ValueError( "linear expects shape[1] to be provided for shape %s, " "but saw %s" % (shape, shape[1])) else: total_arg_size += shape[1].value dtype = [a.dtype for a in args][0] # Now the computation. scope = vs.get_variable_scope() with vs.variable_scope(scope) as outer_scope: #S = int(partial * output_size) #U = vs.get_variable(_WEIGHTS_VARIABLE_NAME+"_U", [total_arg_size, S],dtype=dtype,initializer=kernel_initializer) #V = vs.get_variable(_WEIGHTS_VARIABLE_NAME+"_V", [S, output_size],dtype=dtype,initializer=kernel_initializer) #weights = math_ops.matmul(U,V) weights = vs.get_variable(_WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size], dtype=dtype, initializer=kernel_initializer) #print("weight created - ", total_arg_size, " on ", output_size) # eye = linalg_ops.eye(output_size) #corrMat = gen_array_ops.fill([output_size, output_size], welchBound) #eye = linalg_ops.eye(out) #corrMat = eye + (1.0 - eye) * corrMat #coherence_loss = coherence * math_ops.reduce_max(math_ops.abs(math_ops.matmul(weights, weights, transpose_a=True)) - corrMat) #coherence_loss = 0.0 #eye = linalg_ops.eye(output_size//4) #i_co #mat = weights[:,:output_size//4] #mat = math_ops.matmul(mat, mat, transpose_a=True) #coherence_loss = math_ops.reduce_max(mat) #coherence_loss += math_ops.abs(math_ops.reduce_max(math_ops.abs(math_ops.matmul(weights[:,:output_size//4], weights[:,:output_size//4], transpose_a=True))-eye)) #coherence_loss += math_ops.abs(math_ops.reduce_max(math_ops.abs(mat) - eye)) #f_co #coherence_loss += math_ops.abs(math_ops.reduce_max(math_ops.abs(math_ops.matmul(weights[:,output_size//4 : output_size//2], weights[:,output_size//4 : output_size//2], transpose_a=True))-eye)) #o_co #coherence_loss += math_ops.abs(math_ops.reduce_max(math_ops.abs(math_ops.matmul(weights[:,output_size//2 : 3*output_size//4], weights[:,output_size//2 : 3*output_size//4], transpose_a=True))-eye)) #g_co #coherence_loss += math_ops.abs(math_ops.reduce_max(math_ops.abs(math_ops.matmul(weights[:,3*output_size//4 :], weights[:,3*output_size//4 :], transpose_a=True))-eye)) #coherence_loss *= coherence*0.25#*(i_co+f_co+o_co+g_co) #ops.add_to_collection(ops.GraphKeys.REGULARIZATION_LOSSES, coherence_loss) if len(args) == 1: res = math_ops.matmul(args[0], weights) else: res = math_ops.matmul(array_ops.concat(args, 1), weights) if not bias: return res with vs.variable_scope(outer_scope) as inner_scope: inner_scope.set_partitioner(None) if bias_initializer is None: bias_initializer = init_ops.constant_initializer(0.0, dtype=dtype) biases = vs.get_variable(_BIAS_VARIABLE_NAME, [output_size], dtype=dtype, initializer=bias_initializer) return nn_ops.bias_add(res, biases)
def custom_bidirectional_rnn(cell_fw, cell_bw, inputs, initial_state_fw=None, initial_state_bw=None, dtype=None, sequence_length=None, scope=None): """Creates a bidirectional recurrent neural network. Similar to the unidirectional case above (rnn) but takes input and builds independent forward and backward RNNs with the final forward and backward outputs depth-concatenated, such that the output will have the format [time][batch][cell_fw.output_size + cell_bw.output_size]. The input_size of forward and backward cell must match. The initial state for both directions is zero by default (but can be set optionally) and no intermediate states are ever returned -- the network is fully unrolled for the given (passed in) length(s) of the sequence(s) or completely unrolled if length(s) is not given. Args: cell_fw: An instance of RNNCell, to be used for forward direction. cell_bw: An instance of RNNCell, to be used for backward direction. inputs: A length T list of inputs, each a tensor of shape [batch_size, input_size], or a nested tuple of such elements. initial_state_fw: (optional) An initial state for the forward RNN. This must be a tensor of appropriate type and shape `[batch_size, cell_fw.state_size]`. If `cell_fw.state_size` is a tuple, this should be a tuple of tensors having shapes `[batch_size, s] for s in cell_fw.state_size`. initial_state_bw: (optional) Same as for `initial_state_fw`, but using the corresponding properties of `cell_bw`. dtype: (optional) The data type for the initial state. Required if either of the initial states are not provided. sequence_length: (optional) An int32/int64 vector, size `[batch_size]`, containing the actual lengths for each of the sequences. scope: VariableScope for the created subgraph; defaults to "BiRNN" Returns: A tuple (outputs, output_state_fw, output_state_bw) where: outputs is a length `T` list of outputs (one for each input), which are depth-concatenated forward and backward outputs. output_state_fw is the final state of the forward rnn. output_state_bw is the final state of the backward rnn. Raises: TypeError: If `cell_fw` or `cell_bw` is not an instance of `RNNCell`. ValueError: If inputs is None or an empty list. """ if not isinstance(cell_fw, tf.compat.v1.nn.rnn_cell.RNNCell): raise TypeError("cell_fw must be an instance of RNNCell") if not isinstance(cell_bw, tf.compat.v1.nn.rnn_cell.RNNCell): raise TypeError("cell_bw must be an instance of RNNCell") if not nest.is_sequence(inputs): raise TypeError("inputs must be a sequence") if not inputs: raise ValueError("inputs must not be empty") with vs.variable_scope(scope or "bidirectional_rnn"): # Forward direction with vs.variable_scope("fw") as fw_scope: output_fw, output_state_fw, fw_states = custom_rnn( cell_fw, inputs, initial_state_fw, dtype, sequence_length, scope=fw_scope) # Backward direction with vs.variable_scope("bw") as bw_scope: reversed_inputs = _reverse_seq(inputs, sequence_length) tmp, output_state_bw, tmp_states = custom_rnn(cell_bw, reversed_inputs, initial_state_bw, dtype, sequence_length, scope=bw_scope) output_bw = _reverse_seq(tmp, sequence_length) bw_states = _reverse_seq(tmp_states, sequence_length) # Concat each of the forward/backward outputs flat_output_fw = nest.flatten(output_fw) flat_output_bw = nest.flatten(output_bw) flat_outputs = tuple( array_ops.concat(values=[fw, bw], axis=1) for fw, bw in zip(flat_output_fw, flat_output_bw)) outputs = nest.pack_sequence_as(structure=output_fw, flat_sequence=flat_outputs) return (outputs, output_state_fw, output_state_bw, fw_states, bw_states)
def _linear(args, output_size, bias, bias_start=0.0, weights_init=None, trainable=True, restore=True, reuse=False, scope=None): """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable. Arguments: args: a 2D Tensor or a list of 2D, batch x n, Tensors. output_size: int, second dimension of W[i]. bias: boolean, whether to add a bias term or not. bias_start: starting value to initialize the bias; 0 by default. scope: VariableScope for the created subgraph; defaults to "Linear". Returns: A 2D Tensor with shape [batch x output_size] equal to sum_i(args[i] * W[i]), where W[i]s are newly created matrices. Raises: ValueError: if some of the arguments has unspecified or wrong shape. """ if args is None or (is_sequence(args) and not args): raise ValueError("`args` must be specified") if not is_sequence(args): args = [args] # Calculate the total size of arguments on dimension 1. total_arg_size = 0 shapes = [a.get_shape().as_list() for a in args] for shape in shapes: if len(shape) != 2: raise ValueError("Linear is expecting 2D arguments: %s" % str(shapes)) if not shape[1]: raise ValueError("Linear expects shape[1] of arguments: %s" % str(shapes)) else: total_arg_size += shape[1] # Now the computation. with tf.variable_scope(scope or "Linear", reuse=reuse): matrix = va.variable("Matrix", [total_arg_size, output_size], initializer=weights_init, trainable=trainable, restore=restore) if len(args) == 1: res = tf.matmul(args[0], matrix) else: res = tf.matmul(array_ops.concat(args, 1), matrix) if not bias: return res bias_term = va.variable( "Bias", [output_size], initializer=tf.constant_initializer(bias_start), trainable=trainable, restore=restore) return res + bias_term
def static_rnn(cell, inputs, initial_state=None, dtype=None, sequence_length=None, scope=None, fg=None): """Creates a recurrent neural network specified by RNNCell `cell`. The simplest form of RNN network generated is: ```python state = cell.zero_state(...) outputs = [] for input_ in inputs: output, state = cell(input_, state) outputs.append(output) return (outputs, state) ``` However, a few other options are available: An initial state can be provided. If the sequence_length vector is provided, dynamic calculation is performed. This method of calculation does not compute the RNN steps past the maximum sequence length of the minibatch (thus saving computational time), and properly propagates the state at an example's sequence length to the final state output. The dynamic calculation performed is, at time `t` for batch row `b`, ```python (output, state)(b, t) = (t >= sequence_length(b)) ? (zeros(cell.output_size), states(b, sequence_length(b) - 1)) : cell(input(b, t), state(b, t - 1)) ``` Args: cell: An instance of RNNCell. inputs: A length T list of inputs, each a `Tensor` of shape `[batch_size, input_size]`, or a nested tuple of such elements. initial_state: (optional) An initial state for the RNN. If `cell.state_size` is an integer, this must be a `Tensor` of appropriate type and shape `[batch_size, cell.state_size]`. If `cell.state_size` is a tuple, this should be a tuple of tensors having shapes `[batch_size, s] for s in cell.state_size`. dtype: (optional) The data type for the initial state and expected output. Required if initial_state is not provided or RNN state has a heterogeneous dtype. sequence_length: Specifies the length of each sequence in inputs. An int32 or int64 vector (tensor) size `[batch_size]`, values in `[0, T)`. scope: VariableScope for the created subgraph; defaults to "rnn". Returns: A pair (outputs, state) where: - outputs is a length T list of outputs (one for each input), or a nested tuple of such elements. - state is the final state Raises: TypeError: If `cell` is not an instance of RNNCell. ValueError: If `inputs` is `None` or an empty list, or if the input depth (column size) cannot be inferred from inputs via shape inference. """ # if not isinstance(cell, core_rnn_cell.RNNCell): # raise TypeError("cell must be an instance of RNNCell") # if not nest.is_sequence(inputs): # raise TypeError("inputs must be a sequence") # if not inputs: # raise ValueError("inputs must not be empty") outputs = [] # Create a new scope in which the caching device is either # determined by the parent scope, or is set to place the cached # Variable using the same placement as for the rest of the RNN. with vs.variable_scope(scope or "rnn") as varscope: if varscope.caching_device is None: varscope.set_caching_device(lambda op: op.device) # Obtain the first sequence of the input first_input = inputs while nest.is_sequence(first_input): first_input = first_input[0] # Temporarily avoid EmbeddingWrapper and seq2seq badness # TODO(lukaszkaiser): remove EmbeddingWrapper if first_input.get_shape().ndims != 1: input_shape = first_input.get_shape().with_rank_at_least(2) fixed_batch_size = input_shape[0] flat_inputs = nest.flatten(inputs) for flat_input in flat_inputs: input_shape = flat_input.get_shape().with_rank_at_least(2) batch_size, input_size = input_shape[0], input_shape[1:] fixed_batch_size.merge_with(batch_size) for i, size in enumerate(input_size): if size.value is None: raise ValueError( "Input size (dimension %d of inputs) must be accessible via " "shape inference, but saw value None." % i) else: fixed_batch_size = first_input.get_shape().with_rank_at_least(1)[0] if fixed_batch_size.value: batch_size = fixed_batch_size.value else: batch_size = array_ops.shape(first_input)[0] if initial_state is not None: state = initial_state else: if not dtype: raise ValueError("If no initial_state is provided, " "dtype must be specified") state = cell.zero_state(batch_size, dtype) if sequence_length is not None: # Prepare variables sequence_length = ops.convert_to_tensor( sequence_length, name="sequence_length") if sequence_length.get_shape().ndims not in (None, 1): raise ValueError( "sequence_length must be a vector of length batch_size") def _create_zero_output(output_size): # convert int to TensorShape if necessary size = _state_size_with_prefix(output_size, prefix=[batch_size]) output = array_ops.zeros( array_ops.stack(size), _infer_state_dtype(dtype, state)) shape = _state_size_with_prefix( output_size, prefix=[fixed_batch_size.value]) output.set_shape(tensor_shape.TensorShape(shape)) return output output_size = cell.output_size flat_output_size = nest.flatten(output_size) flat_zero_output = tuple( _create_zero_output(size) for size in flat_output_size) zero_output = nest.pack_sequence_as(structure=output_size, flat_sequence=flat_zero_output) sequence_length = math_ops.to_int32(sequence_length) min_sequence_length = math_ops.reduce_min(sequence_length) max_sequence_length = math_ops.reduce_max(sequence_length) for time, input_ in enumerate(inputs): if time > 0: varscope.reuse_variables() # pylint: disable=cell-var-from-loop #if fg: call_cell = lambda: cell(input_, state)#.apply(fg, state) #else : call_cell = lambda: cell(input_, state) call_cell = lambda: cell(input_, state) # pylint: enable=cell-var-from-loop if sequence_length is not None: (output, state) = _rnn_step( time=time, sequence_length=sequence_length, min_sequence_length=min_sequence_length, max_sequence_length=max_sequence_length, zero_output=zero_output, state=state, call_cell=call_cell, state_size=cell.state_size) else: (output, state) = call_cell() outputs.append(output) return (outputs, state)
def _dynamic_rnn_loop( cell, inputs, initial_state, parallel_iterations, swap_memory, sequence_length=None): """Internal implementation of Dynamic RNN. Args: cell: An instance of RNNCell. inputs: A `Tensor` of shape [time, batch_size, input_size]. initial_state: A `Tensor` of shape `[batch_size, state_size]`, or if `cell.state_size` is a tuple, then this should be a tuple of tensors having shapes `[batch_size, s] for s in cell.state_size`. parallel_iterations: Positive Python int. swap_memory: A Python boolean sequence_length: (optional) An `int32` `Tensor` of shape [batch_size]. Returns: Tuple `(final_outputs, final_state)`. final_outputs: A `Tensor` of shape `[time, batch_size, cell.output_size]`. final_state: A `Tensor` matrix, or tuple of such matrices, matching in length and shapes to `initial_state`. Raises: ValueError: If the input depth cannot be inferred via shape inference from the inputs. """ state = initial_state assert isinstance(parallel_iterations, int), "parallel_iterations must be int" # Construct an initial output input_shape = array_ops.shape(inputs) time_steps = input_shape[0] batch_size = input_shape[1] inputs_got_shape = inputs.get_shape().with_rank_at_least(3).as_list() const_time_steps = inputs_got_shape[0] const_batch_size = inputs_got_shape[1] const_depth = inputs_got_shape[2:] if const_depth is None: raise ValueError( "Input size (depth of inputs) must be accessible via shape inference, " "but saw value None.") # Prepare dynamic conditional copying of state & output zeros_size = _state_size_with_prefix(cell.output_size, prefix=[batch_size]) zero_output = array_ops.zeros(array_ops.pack(zeros_size), inputs.dtype) if sequence_length is not None: min_sequence_length = math_ops.reduce_min(sequence_length) max_sequence_length = math_ops.reduce_max(sequence_length) time = array_ops.constant(0, dtype=dtypes.int32, name="time") state_size = cell.state_size state_is_tuple = nest.is_sequence(state_size) state = nest.flatten(state) if state_is_tuple else (state,) with ops.op_scope([], "dynamic_rnn") as scope: base_name = scope output_ta = tensor_array_ops.TensorArray( dtype=inputs.dtype, size=time_steps, tensor_array_name=base_name + "output") input_ta = tensor_array_ops.TensorArray( dtype=inputs.dtype, size=time_steps, tensor_array_name=base_name + "input") input_ta = input_ta.unpack(inputs) def _time_step(time, output_ta_t, *state): """Take a time step of the dynamic RNN. Args: time: int32 scalar Tensor. output_ta_t: `TensorArray`, the output with existing flow. *state: List of vector tensors. Returns: The tuple (time + 1, output_ta_t with updated flow) + new_state. """ input_t = input_ta.read(time) # Restore some shape information input_t.set_shape([const_batch_size] + const_depth) # Pack state back up for use by cell state = (nest.pack_sequence_as(structure=state_size, flat_sequence=state) if state_is_tuple else state[0]) call_cell = lambda: cell(input_t, state) if sequence_length is not None: (output, new_state) = _rnn_step( time=time, sequence_length=sequence_length, min_sequence_length=min_sequence_length, max_sequence_length=max_sequence_length, zero_output=zero_output, state=state, call_cell=call_cell, state_size=state_size, skip_conditionals=True) else: (output, new_state) = call_cell() # Pack state if using state tuples new_state = ( tuple(nest.flatten(new_state)) if state_is_tuple else (new_state,)) output_ta_t = output_ta_t.write(time, output) return (time + 1, output_ta_t) + new_state final_loop_vars = control_flow_ops.while_loop( cond=lambda time, *_: time < time_steps, body=_time_step, loop_vars=(time, output_ta) + tuple(state), parallel_iterations=parallel_iterations, swap_memory=swap_memory) (output_final_ta, final_state) = (final_loop_vars[1], final_loop_vars[2:]) final_outputs = output_final_ta.pack() # Restore some shape information final_outputs_size = _state_size_with_prefix( cell.output_size, prefix=[const_time_steps, const_batch_size]) final_outputs.set_shape(final_outputs_size) # Unpack final state if not using state tuples. final_state = ( nest.pack_sequence_as( structure=cell.state_size, flat_sequence=final_state) if state_is_tuple else final_state[0]) return (final_outputs, final_state)
def map_fn(fn, elems, dtype=None, parallel_iterations=None, back_prop=True, swap_memory=False, infer_shape=True, name=None): """map on the list of tensors unpacked from `elems` on dimension 0. The simplest version of `map_fn` repeatedly applies the callable `fn` to a sequence of elements from first to last. The elements are made of the tensors unpacked from `elems`. `dtype` is the data type of the return value of `fn`. Users must provide `dtype` if it is different from the data type of `elems`. Suppose that `elems` is unpacked into `values`, a list of tensors. The shape of the result tensor is `[values.shape[0]] + fn(values[0]).shape`. This method also allows multi-arity `elems` and output of `fn`. If `elems` is a (possibly nested) list or tuple of tensors, then each of these tensors must have a matching first (unpack) dimension. The signature of `fn` may match the structure of `elems`. That is, if `elems` is `(t1, [t2, t3, [t4, t5]])`, then an appropriate signature for `fn` is: `fn = lambda (t1, [t2, t3, [t4, t5]]):`. Furthermore, `fn` may emit a different structure than its input. For example, `fn` may look like: `fn = lambda t1: return (t1 + 1, t1 - 1)`. In this case, the `dtype` parameter is not optional: `dtype` must be a type or (possibly nested) tuple of types matching the output of `fn`. To apply a functional operation to the nonzero elements of a SparseTensor one of the following methods is recommended. First, if the function is expressible as TensorFlow ops, use ```python result = SparseTensor(input.indices, fn(input.values), input.dense_shape) ``` If, however, the function is not expressible as a TensorFlow op, then use ```python result = SparseTensor( input.indices, map_fn(fn, input.values), input.dense_shape) ``` instead. When executing eagerly, map_fn does not execute in parallel even if `parallel_iterations` is set to a value > 1. You can still get the performance benefits of running a function in parallel by using the `tf.contrib.eager.defun` decorator, ```python # Assume the function being used in map_fn is fn. # To ensure map_fn calls fn in parallel, use the defun decorator. @tf.contrib.eager.defun def func(tensor): return tf.map_fn(fn, tensor) ``` Note that if you use the defun decorator, any non-TensorFlow Python code that you may have written in your function won't get executed. See `tf.contrib.eager.defun` for more details. The recommendation would be to debug without defun but switch to defun to get performance benefits of running map_fn in parallel. Args: fn: The callable to be performed. It accepts one argument, which will have the same (possibly nested) structure as `elems`. Its output must have the same structure as `dtype` if one is provided, otherwise it must have the same structure as `elems`. elems: A tensor or (possibly nested) sequence of tensors, each of which will be unpacked along their first dimension. The nested sequence of the resulting slices will be applied to `fn`. dtype: (optional) The output type(s) of `fn`. If `fn` returns a structure of Tensors differing from the structure of `elems`, then `dtype` is not optional and must have the same structure as the output of `fn`. parallel_iterations: (optional) The number of iterations allowed to run in parallel. When graph building, the default value is 10. While executing eagerly, the default value is set to 1. back_prop: (optional) True enables support for back propagation. swap_memory: (optional) True enables GPU-CPU memory swapping. infer_shape: (optional) False disables tests for consistent output shapes. name: (optional) Name prefix for the returned tensors. Returns: A tensor or (possibly nested) sequence of tensors. Each tensor packs the results of applying `fn` to tensors unpacked from `elems` along the first dimension, from first to last. Raises: TypeError: if `fn` is not callable or the structure of the output of `fn` and `dtype` do not match, or if elems is a SparseTensor. ValueError: if the lengths of the output of `fn` and `dtype` do not match. Examples: ```python elems = np.array([1, 2, 3, 4, 5, 6]) squares = map_fn(lambda x: x * x, elems) # squares == [1, 4, 9, 16, 25, 36] ``` ```python elems = (np.array([1, 2, 3]), np.array([-1, 1, -1])) alternate = map_fn(lambda x: x[0] * x[1], elems, dtype=tf.int64) # alternate == [-1, 2, -3] ``` ```python elems = np.array([1, 2, 3]) alternates = map_fn(lambda x: (x, -x), elems, dtype=(tf.int64, tf.int64)) # alternates[0] == [1, 2, 3] # alternates[1] == [-1, -2, -3] ``` """ if not callable(fn): raise TypeError("fn must be callable.") if isinstance(elems, sparse_tensor.SparseTensor): raise TypeError( "To perform a map on the values of a sparse tensor use either " " SparseTensor(input.indices, fn(input.values), input.dense_shape) or " " SparseTensor(input.indices, map_fn(fn, input.values), " "input.dense_shape)") in_graph_mode = not context.executing_eagerly() # Set the default number of parallel_iterations depending on graph/eager mode. if in_graph_mode and not parallel_iterations: parallel_iterations = 10 elif not in_graph_mode and not parallel_iterations: parallel_iterations = 1 if not in_graph_mode and parallel_iterations > 1: logging.log_first_n( logging.WARN, "Setting parallel_iterations > 1 has no " "effect when executing eagerly. Consider calling map_fn" " with tf.contrib.eager.defun to execute fn in " "parallel.", 1) parallel_iterations = 1 input_is_sequence = nest.is_sequence(elems) input_flatten = lambda x: nest.flatten(x) if input_is_sequence else [x] def input_pack(x): return nest.pack_sequence_as(elems, x) if input_is_sequence else x[0] if dtype is None: output_is_sequence = input_is_sequence output_flatten = input_flatten output_pack = input_pack else: output_is_sequence = nest.is_sequence(dtype) output_flatten = lambda x: nest.flatten( x) if output_is_sequence else [x] def output_pack(x): return (nest.pack_sequence_as(dtype, x) if output_is_sequence else x[0]) elems_flat = input_flatten(elems) with ops.name_scope(name, "map", elems_flat): # TODO(akshayka): Remove the in_graph_mode check once caching devices are # supported in Eager if in_graph_mode: # Any get_variable calls in fn will cache the first call locally # and not issue repeated network I/O requests for each iteration. varscope = vs.get_variable_scope() varscope_caching_device_was_none = False if varscope.caching_device is None: # TODO(ebrevdo): Change to using colocate_with here and in other # methods. varscope.set_caching_device(lambda op: op.device) varscope_caching_device_was_none = True elems_flat = [ ops.convert_to_tensor(elem, name="elem") for elem in elems_flat ] dtype = dtype or input_pack([elem.dtype for elem in elems_flat]) dtype_flat = output_flatten(dtype) # Convert elems to tensor array. n may be known statically. static_shape = elems_flat[0].shape if static_shape.ndims is not None and static_shape.ndims < 1: if len(elems_flat) == 1: raise ValueError( "elems must be a 1+ dimensional Tensor, not a scalar") else: raise ValueError( "elements in elems must be 1+ dimensional Tensors, not scalars" ) n = (tensor_shape.dimension_value(static_shape[0]) or array_ops.shape(elems_flat[0])[0]) # TensorArrays are always flat elems_ta = [ tensor_array_ops.TensorArray(dtype=elem.dtype, size=n, dynamic_size=False, infer_shape=True) for elem in elems_flat ] # Unpack elements elems_ta = [ elem_ta.unstack(elem) for elem_ta, elem in zip(elems_ta, elems_flat) ] i = constant_op.constant(0) accs_ta = [ tensor_array_ops.TensorArray(dtype=dt, size=n, dynamic_size=False, infer_shape=infer_shape) for dt in dtype_flat ] def compute(i, tas): """The loop body of map_fn. Args: i: the loop counter tas: the flat TensorArray accumulator list Returns: (i + 1, tas): the updated counter + updated TensorArrays Raises: TypeError: if dtype and packed_fn_values structure do not match ValueType: if dtype and packed_fn_values lengths do not match """ packed_values = input_pack( [elem_ta.read(i) for elem_ta in elems_ta]) packed_fn_values = fn(packed_values) nest.assert_same_structure(dtype or elems, packed_fn_values) flat_fn_values = output_flatten(packed_fn_values) tas = [ ta.write(i, value) for (ta, value) in zip(tas, flat_fn_values) ] return (i + 1, tas) _, r_a = control_flow_ops.while_loop( lambda i, _: i < n, compute, (i, accs_ta), parallel_iterations=parallel_iterations, back_prop=back_prop, swap_memory=swap_memory, maximum_iterations=n) results_flat = [r.stack() for r in r_a] n_static = tensor_shape.Dimension( tensor_shape.dimension_value( elems_flat[0].get_shape().with_rank_at_least(1)[0])) for elem in elems_flat[1:]: n_static.merge_with( tensor_shape.Dimension( tensor_shape.dimension_value( elem.get_shape().with_rank_at_least(1)[0]))) for r in results_flat: r.set_shape( tensor_shape.TensorShape(n_static).concatenate( r.get_shape()[1:])) # TODO(akshayka): Remove the in_graph_mode check once caching devices are # supported in Eager if in_graph_mode and varscope_caching_device_was_none: varscope.set_caching_device(None) return output_pack(results_flat)
def map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True, swap_memory=False, infer_shape=True, name=None): """map on the list of tensors unpacked from `elems` on dimension 0. The simplest version of `map` repeatedly applies the callable `fn` to a sequence of elements from first to last. The elements are made of the tensors unpacked from `elems`. `dtype` is the data type of the return value of `fn`. Users must provide `dtype` if it is different from the data type of `elems`. Suppose that `elems` is unpacked into `values`, a list of tensors. The shape of the result tensor is `[values.shape[0]] + fn(values[0]).shape`. This method also allows multi-arity `elems` and output of `fn`. If `elems` is a (possibly nested) list or tuple of tensors, then each of these tensors must have a matching first (unpack) dimension. The signature of `fn` may match the structure of `elems`. That is, if `elems` is `(t1, [t2, t3, [t4, t5]])`, then an appropriate signature for `fn` is: `fn = lambda (t1, [t2, t3, [t4, t5]]):`. Furthermore, `fn` may emit a different structure than its input. For example, `fn` may look like: `fn = lambda t1: return (t1 + 1, t1 - 1)`. In this case, the `dtype` parameter is not optional: `dtype` must be a type or (possibly nested) tuple of types matching the output of `fn`. To apply a functional operation to the nonzero elements of a SparseTensor one of the following methods is recommended. First, if the function is expressible as TensorFlow ops, use ```python result = SparseTensor(input.indices, fn(input.values), input.dense_shape) ``` If, however, the function is not expressible as a TensorFlow op, then use ```python result = SparseTensor( input.indices, map_fn(fn, input.values), input.dense_shape) ``` instead. Args: fn: The callable to be performed. It accepts one argument, which will have the same (possibly nested) structure as `elems`. Its output must have the same structure as `dtype` if one is provided, otherwise it must have the same structure as `elems`. elems: A tensor or (possibly nested) sequence of tensors, each of which will be unpacked along their first dimension. The nested sequence of the resulting slices will be applied to `fn`. dtype: (optional) The output type(s) of `fn`. If `fn` returns a structure of Tensors differing from the structure of `elems`, then `dtype` is not optional and must have the same structure as the output of `fn`. parallel_iterations: (optional) The number of iterations allowed to run in parallel. back_prop: (optional) True enables support for back propagation. swap_memory: (optional) True enables GPU-CPU memory swapping. infer_shape: (optional) False disables tests for consistent output shapes. name: (optional) Name prefix for the returned tensors. Returns: A tensor or (possibly nested) sequence of tensors. Each tensor packs the results of applying `fn` to tensors unpacked from `elems` along the first dimension, from first to last. Raises: TypeError: if `fn` is not callable or the structure of the output of `fn` and `dtype` do not match, or if elems is a SparseTensor. ValueError: if the lengths of the output of `fn` and `dtype` do not match. Examples: ```python elems = np.array([1, 2, 3, 4, 5, 6]) squares = map_fn(lambda x: x * x, elems) # squares == [1, 4, 9, 16, 25, 36] ``` ```python elems = (np.array([1, 2, 3]), np.array([-1, 1, -1])) alternate = map_fn(lambda x: x[0] * x[1], elems, dtype=tf.int64) # alternate == [-1, 2, -3] ``` ```python elems = np.array([1, 2, 3]) alternates = map_fn(lambda x: (x, -x), elems, dtype=(tf.int64, tf.int64)) # alternates[0] == [1, 2, 3] # alternates[1] == [-1, -2, -3] ``` """ if not callable(fn): raise TypeError("fn must be callable.") if isinstance(elems, sparse_tensor.SparseTensor): raise TypeError( "To perform a map on the values of a sparse tensor use either " " SparseTensor(input.indices, fn(input.values), input.dense_shape) or " " SparseTensor(input.indices, map_fn(fn, input.values), " "input.dense_shape)") input_is_sequence = nest.is_sequence(elems) input_flatten = lambda x: nest.flatten(x) if input_is_sequence else [x] def input_pack(x): return nest.pack_sequence_as(elems, x) if input_is_sequence else x[0] if dtype is None: output_is_sequence = input_is_sequence output_flatten = input_flatten output_pack = input_pack else: output_is_sequence = nest.is_sequence(dtype) output_flatten = lambda x: nest.flatten(x) if output_is_sequence else [x] def output_pack(x): return (nest.pack_sequence_as(dtype, x) if output_is_sequence else x[0]) elems_flat = input_flatten(elems) with ops.name_scope(name, "map", elems_flat): # Any get_variable calls in fn will cache the first call locally # and not issue repeated network I/O requests for each iteration. varscope = vs.get_variable_scope() varscope_caching_device_was_none = False if varscope.caching_device is None: # TODO(ebrevdo): Change to using colocate_with here and in other methods. varscope.set_caching_device(lambda op: op.device) varscope_caching_device_was_none = True elems_flat = [ ops.convert_to_tensor(elem, name="elem") for elem in elems_flat] dtype = dtype or input_pack([elem.dtype for elem in elems_flat]) dtype_flat = output_flatten(dtype) # Convert elems to tensor array. n = array_ops.shape(elems_flat[0])[0] # TensorArrays are always flat elems_ta = [ tensor_array_ops.TensorArray(dtype=elem.dtype, size=n, dynamic_size=False, infer_shape=True) for elem in elems_flat] # Unpack elements elems_ta = [ elem_ta.unpack(elem) for elem_ta, elem in zip(elems_ta, elems_flat)] i = constant_op.constant(0) accs_ta = [ tensor_array_ops.TensorArray(dtype=dt, size=n, dynamic_size=False, infer_shape=infer_shape) for dt in dtype_flat] def compute(i, tas): """The loop body of map_fn. Args: i: the loop counter tas: the flat TensorArray accumulator list Returns: (i + 1, tas): the updated counter + updated TensorArrays Raises: TypeError: if dtype and packed_fn_values structure do not match ValueType: if dtype and packed_fn_values lengths do not match """ packed_values = input_pack([elem_ta.read(i) for elem_ta in elems_ta]) packed_fn_values = fn(packed_values) nest.assert_same_structure(dtype or elems, packed_fn_values) flat_fn_values = output_flatten(packed_fn_values) tas = [ta.write(i, value) for (ta, value) in zip(tas, flat_fn_values)] return (i + 1, tas) _, r_a = control_flow_ops.while_loop( lambda i, _: i < n, compute, (i, accs_ta), parallel_iterations=parallel_iterations, back_prop=back_prop, swap_memory=swap_memory) results_flat = [r.pack() for r in r_a] n_static = elems_flat[0].get_shape().with_rank_at_least(1)[0] for elem in elems_flat[1:]: n_static.merge_with(elem.get_shape().with_rank_at_least(1)[0]) for r in results_flat: r.set_shape(tensor_shape.TensorShape(n_static).concatenate( r.get_shape()[1:])) if varscope_caching_device_was_none: varscope.set_caching_device(None) return output_pack(results_flat)
def __init__(self, cell, beam_size, stop_token, initial_state, initial_input, score_upper_bound=None, max_len=100, outputs_to_score_fn=None, tokens_to_inputs_fn=None, cell_transform='default', scope=None): self.beam_size = beam_size self.stop_token = stop_token self.max_len = max_len self.scope = scope if score_upper_bound is None and outputs_to_score_fn is None: self.score_upper_bound = 0.0 elif score_upper_bound is None or score_upper_bound > 3e38: # Note: 3e38 is just a little smaller than the largest float32 # Second condition allows for Infinity as a synonym for None self.score_upper_bound = None else: self.score_upper_bound = float(score_upper_bound) if self.max_len is None and self.score_upper_bound is None: raise ValueError( "Beam search needs a stopping criterion. Please provide max_len or score_upper_bound." ) if cell_transform == 'default': if type(cell) in [LSTMCell, BasicLSTMCell, MultiRNNCell]: cell_transform = 'flatten' else: cell_transform = 'replicate' if cell_transform == 'none': self.cell = cell self.initial_state = initial_state self.initial_input = initial_input elif cell_transform == 'flatten': self.cell = BeamFlattenWrapper(cell, self.beam_size) self.initial_state = self.cell.tile_along_beam(initial_state) self.initial_input = self.cell.tile_along_beam(initial_input) elif cell_transform == 'replicate': self.cell = BeamReplicateWrapper(cell, self.beam_size) self.initial_state = self.cell.tile_along_beam(initial_state) self.initial_input = self.cell.tile_along_beam(initial_input) else: raise ValueError( "cell_transform must be one of: 'default', 'flatten', 'replicate', 'none'" ) self._cell_transform_used = cell_transform if outputs_to_score_fn is not None: self.outputs_to_score_fn = outputs_to_score_fn if tokens_to_inputs_fn is not None: self.tokens_to_inputs_fn = tokens_to_inputs_fn batch_size = tf.Dimension(None) if not nest.is_sequence(self.initial_state): batch_size = batch_size.merge_with( self.initial_state.get_shape()[0]) else: for tensor in nest.flatten(self.initial_state): batch_size = batch_size.merge_with(tensor.get_shape()[0]) if not nest.is_sequence(self.initial_input): batch_size = batch_size.merge_with( self.initial_input.get_shape()[0]) else: for tensor in nest.flatten(self.initial_input): batch_size = batch_size.merge_with(tensor.get_shape()[0]) self.inferred_batch_size = batch_size.value if self.inferred_batch_size is not None: self.batch_size = self.inferred_batch_size else: if not nest.is_sequence(self.initial_state): self.batch_size = tf.shape(self.initial_state)[0] else: self.batch_size = tf.shape( list(nest.flatten(self.initial_state))[0])[0] self.inferred_batch_size_times_beam_size = None if self.inferred_batch_size is not None: self.inferred_batch_size_times_beam_size = self.inferred_batch_size * self.beam_size self.batch_size_times_beam_size = self.batch_size * self.beam_size
def ensure_square(shape): if not nest.is_sequence(shape): shape = (shape, shape) return shape
def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True, swap_memory=False, infer_shape=True, reverse=False, name=None): """scan on the list of tensors unpacked from `elems` on dimension 0. The simplest version of `scan` repeatedly applies the callable `fn` to a sequence of elements from first to last. The elements are made of the tensors unpacked from `elems` on dimension 0. The callable fn takes two tensors as arguments. The first argument is the accumulated value computed from the preceding invocation of fn. If `initializer` is None, `elems` must contain at least one element, and its first element is used as the initializer. Suppose that `elems` is unpacked into `values`, a list of tensors. The shape of the result tensor is `[len(values)] + fn(initializer, values[0]).shape`. If reverse=True, it's fn(initializer, values[-1]).shape. This method also allows multi-arity `elems` and accumulator. If `elems` is a (possibly nested) list or tuple of tensors, then each of these tensors must have a matching first (unpack) dimension. The second argument of `fn` must match the structure of `elems`. If no `initializer` is provided, the output structure and dtypes of `fn` are assumed to be the same as its input; and in this case, the first argument of `fn` must match the structure of `elems`. If an `initializer` is provided, then the output of `fn` must have the same structure as `initializer`; and the first argument of `fn` must match this structure. For example, if `elems` is `(t1, [t2, t3])` and `initializer` is `[i1, i2]` then an appropriate signature for `fn` in `python2` is: `fn = lambda (acc_p1, acc_p2), (t1, [t2, t3]):` and `fn` must return a list, `[acc_n1, acc_n2]`. An alternative correct signature for `fn`, and the one that works in `python3`, is: `fn = lambda a, t:`, where `a` and `t` correspond to the input tuples. Args: fn: The callable to be performed. It accepts two arguments. The first will have the same structure as `initializer` if one is provided, otherwise it will have the same structure as `elems`. The second will have the same (possibly nested) structure as `elems`. Its output must have the same structure as `initializer` if one is provided, otherwise it must have the same structure as `elems`. elems: A tensor or (possibly nested) sequence of tensors, each of which will be unpacked along their first dimension. The nested sequence of the resulting slices will be the first argument to `fn`. initializer: (optional) A tensor or (possibly nested) sequence of tensors, initial value for the accumulator, and the expected output type of `fn`. parallel_iterations: (optional) The number of iterations allowed to run in parallel. back_prop: (optional) True enables support for back propagation. swap_memory: (optional) True enables GPU-CPU memory swapping. infer_shape: (optional) False disables tests for consistent output shapes. reverse: (optional) True scans the tensor last to first (instead of first to last). name: (optional) Name prefix for the returned tensors. Returns: A tensor or (possibly nested) sequence of tensors. Each tensor packs the results of applying `fn` to tensors unpacked from `elems` along the first dimension, and the previous accumulator value(s), from first to last (or last to first, if `reverse=True`). Raises: TypeError: if `fn` is not callable or the structure of the output of `fn` and `initializer` do not match. ValueError: if the lengths of the output of `fn` and `initializer` do not match. Examples: ```python elems = np.array([1, 2, 3, 4, 5, 6]) sum = scan(lambda a, x: a + x, elems) # sum == [1, 3, 6, 10, 15, 21] sum = scan(lambda a, x: a + x, elems, reverse=True) # sum == [21, 20, 18, 15, 11, 6] ``` ```python elems = np.array([1, 2, 3, 4, 5, 6]) initializer = np.array(0) sum_one = scan( lambda a, x: x[0] - x[1] + a, (elems + 1, elems), initializer) # sum_one == [1, 2, 3, 4, 5, 6] ``` ```python elems = np.array([1, 0, 0, 0, 0, 0]) initializer = (np.array(0), np.array(1)) fibonaccis = scan(lambda a, _: (a[1], a[0] + a[1]), elems, initializer) # fibonaccis == ([1, 1, 2, 3, 5, 8], [1, 2, 3, 5, 8, 13]) ``` """ if not callable(fn): raise TypeError("fn must be callable.") input_is_sequence = nest.is_sequence(elems) input_flatten = lambda x: nest.flatten(x) if input_is_sequence else [x] def input_pack(x): return nest.pack_sequence_as(elems, x) if input_is_sequence else x[0] if initializer is None: output_is_sequence = input_is_sequence output_flatten = input_flatten output_pack = input_pack else: output_is_sequence = nest.is_sequence(initializer) output_flatten = lambda x: nest.flatten( x) if output_is_sequence else [x] def output_pack(x): return (nest.pack_sequence_as(initializer, x) if output_is_sequence else x[0]) elems_flat = input_flatten(elems) in_graph_mode = not context.executing_eagerly() with ops.name_scope(name, "scan", elems_flat): # TODO(akshayka): Remove the in_graph_mode check once caching devices are # supported in Eager if in_graph_mode: # Any get_variable calls in fn will cache the first call locally # and not issue repeated network I/O requests for each iteration. varscope = vs.get_variable_scope() varscope_caching_device_was_none = False if varscope.caching_device is None: # TODO(ebrevdo): Change to using colocate_with here and in other # methods. varscope.set_caching_device(lambda op: op.device) varscope_caching_device_was_none = True # Convert elems to tensor array. elems_flat = [ ops.convert_to_tensor(elem, name="elem") for elem in elems_flat ] # Convert elems to tensor array. n may be known statically. n = tensor_shape.dimension_value(elems_flat[0].shape[0]) if n is None: n = array_ops.shape(elems_flat[0])[0] # TensorArrays are always flat elems_ta = [ tensor_array_ops.TensorArray(dtype=elem.dtype, size=n, dynamic_size=False, element_shape=elem.shape[1:], infer_shape=True) for elem in elems_flat ] # Unpack elements elems_ta = [ elem_ta.unstack(elem) for elem_ta, elem in zip(elems_ta, elems_flat) ] if initializer is None: a_flat = [elem.read(n - 1 if reverse else 0) for elem in elems_ta] i = 1 else: initializer_flat = output_flatten(initializer) a_flat = [ops.convert_to_tensor(init) for init in initializer_flat] i = 0 # Create a tensor array to store the intermediate values. accs_ta = [ tensor_array_ops.TensorArray( dtype=init.dtype, size=n, element_shape=init.shape if infer_shape else None, dynamic_size=False, infer_shape=infer_shape) for init in a_flat ] if initializer is None: accs_ta = [ acc_ta.write(n - 1 if reverse else 0, a) for (acc_ta, a) in zip(accs_ta, a_flat) ] def compute(i, a_flat, tas): """The loop body of scan. Args: i: the loop counter. a_flat: the accumulator value(s), flattened. tas: the output accumulator TensorArray(s), flattened. Returns: [i + 1, a_flat, tas]: the updated counter + new accumulator values + updated TensorArrays Raises: TypeError: if initializer and fn() output structure do not match ValueType: if initializer and fn() output lengths do not match """ packed_elems = input_pack( [elem_ta.read(i) for elem_ta in elems_ta]) packed_a = output_pack(a_flat) a_out = fn(packed_a, packed_elems) nest.assert_same_structure( elems if initializer is None else initializer, a_out) flat_a_out = output_flatten(a_out) tas = [ta.write(i, value) for (ta, value) in zip(tas, flat_a_out)] if reverse: next_i = i - 1 else: next_i = i + 1 return (next_i, flat_a_out, tas) if reverse: initial_i = n - 1 - i condition = lambda i, _1, _2: i >= 0 else: initial_i = i condition = lambda i, _1, _2: i < n _, _, r_a = control_flow_ops.while_loop( condition, compute, (initial_i, a_flat, accs_ta), parallel_iterations=parallel_iterations, back_prop=back_prop, swap_memory=swap_memory, maximum_iterations=n) results_flat = [r.stack() for r in r_a] n_static = tensor_shape.Dimension( tensor_shape.dimension_value( elems_flat[0].get_shape().with_rank_at_least(1)[0])) for elem in elems_flat[1:]: n_static.merge_with( tensor_shape.Dimension( tensor_shape.dimension_value( elem.get_shape().with_rank_at_least(1)[0]))) for r in results_flat: r.set_shape( tensor_shape.TensorShape(n_static).concatenate( r.get_shape()[1:])) # TODO(akshayka): Remove the in_graph_mode check once caching devices are # supported in Eager if in_graph_mode and varscope_caching_device_was_none: varscope.set_caching_device(None) return output_pack(results_flat)
def __init__(self, args, output_size, build_bias, bias_initializer=None, kernel_initializer=None): self._build_bias = build_bias if args is None or (nest.is_sequence(args) and not args): raise ValueError("`args` must be specified") if not nest.is_sequence(args): args = [args] self._is_sequence = False else: self._is_sequence = True # Calculate the total size of arguments on dimension 1. total_arg_size = 0 shapes = [a.get_shape() for a in args] for shape in shapes: if shape.ndims != 2: raise ValueError("linear is expecting 2D arguments: %s" % shapes) if shape[1].value is None: raise ValueError( "linear expects shape[1] to be provided for shape %s, " "but saw %s" % (shape, shape[1])) else: total_arg_size += shape[1].value dtype = [a.dtype for a in args][0] scope = vs.get_variable_scope() with vs.variable_scope(scope) as outer_scope: self._weights = vs.get_variable(_WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size], dtype=dtype, initializer=kernel_initializer) if build_bias: with vs.variable_scope(outer_scope) as inner_scope: inner_scope.set_partitioner(None) if bias_initializer is None: bias_initializer = init_ops.constant_initializer( 0.0, dtype=dtype) self._biases = vs.get_variable( _BIAS_VARIABLE_NAME, [output_size], dtype=dtype, initializer=bias_initializer)
def __call__(self, cell_inputs, state, scope=None): ( past_beam_symbols, # [batch_size*self.beam_size, :], right-aligned!!! past_beam_logprobs, # [batch_size*self.beam_size] past_cell_states # LSTM: ([batch_size*self.beam_size, :, dim], # [batch_size*self.beam_size, :, dim]) # GRU: [batch_size*self.beam_size, :, dim] ) = state past_cell_state = self.get_last_cell_state(past_cell_states) if self.use_copy and self.copy_fun == 'copynet': cell_output, cell_state, alignments, attns = \ self.cell(cell_inputs, past_cell_state, scope) elif self.use_attention: cell_output, cell_state, alignments, attns = \ self.cell(cell_inputs, past_cell_state, scope) else: cell_output, cell_state = \ self.cell(cell_inputs, past_cell_state, scope) # [batch_size*beam_size, num_classes] if self.use_copy and self.copy_fun == 'copynet': logprobs = tf.math.log(cell_output) else: W, b = self.output_project if self.locally_normalized: logprobs = tf.nn.log_softmax(tf.matmul(cell_output, W) + b) else: logprobs = tf.matmul(cell_output, W) + b num_classes = logprobs.get_shape()[1] # stop_mask: indicates partial sequences ending with a stop token # [batch_size * beam_size] # x 0 # _STOP 1 # x 0 # x 0 input_symbols = past_beam_symbols[:, -1] stop_mask = tf.expand_dims(tf.cast( tf.equal(input_symbols, self.stop_token), tf.float32), 1) # done_mask: indicates stop token in the output vocabulary # [1, num_classes] # [- - _STOP - - -] # [0 0 1 0 0 0] done_mask = tf.cast(tf.reshape(tf.equal(tf.range(num_classes), self.stop_token), [1, num_classes]), tf.float32) # set the next token distribution of partial sequences ending with # a stop token to: # [- - _STOP - - -] # [-inf -inf 0 -inf -inf -inf] logprobs = tf.add(logprobs, tf.multiply( stop_mask, -1e18 * (tf.ones_like(done_mask) - done_mask))) logprobs = tf.multiply(logprobs, (1 - tf.multiply(stop_mask, done_mask))) # length normalization past_logprobs_unormalized = \ tf.multiply(past_beam_logprobs, tf.pow(self.seq_len, self.alpha)) logprobs_unormalized = \ tf.expand_dims(past_logprobs_unormalized, 1) + logprobs seq_len = tf.expand_dims(self.seq_len, 1) + (1 - stop_mask) logprobs_batched = tf.compat.v1.div(logprobs_unormalized, tf.pow(seq_len, self.alpha)) beam_logprobs, indices = tf.nn.top_k( tf.reshape(logprobs_batched, [-1, self.beam_size * num_classes]), self.beam_size ) beam_logprobs = tf.reshape(beam_logprobs, [-1]) # For continuing to the next symbols parent_refs_offsets = \ (tf.range(self.full_size) // self.beam_size) * self.beam_size symbols = indices % num_classes # [batch_size, self.beam_size] parent_refs = tf.reshape(indices // num_classes, [-1]) # [batch_size*self.beam_size] parent_refs = parent_refs + parent_refs_offsets beam_symbols = tf.concat(axis=1, values=[tf.gather(past_beam_symbols, parent_refs), tf.reshape(symbols, [-1, 1])]) self.seq_len = tf.squeeze(tf.gather(seq_len, parent_refs), axis=[1]) if self.use_attention: ranked_alignments = nest_map( lambda element: tf.gather(element, parent_refs), alignments) ranked_attns = nest_map( lambda element: tf.gather(element, parent_refs), attns) # update cell_states def concat_and_gather_tuple_states(pc_states, c_state): rc_states = ( tf.concat(axis=1, values=[pc_states[0], tf.expand_dims(c_state[0], 1)]), tf.concat(axis=1, values=[pc_states[1], tf.expand_dims(c_state[1], 1)]) ) c_states = ( nest_map(lambda element: tf.gather(element, parent_refs), rc_states[0]), nest_map(lambda element: tf.gather(element, parent_refs), rc_states[1]) ) return c_states if nest.is_sequence(cell_state): if self.num_layers > 1: ranked_cell_states = [concat_and_gather_tuple_states(pc_states, c_state) for pc_states, c_state in zip(past_cell_states, cell_state)] else: ranked_cell_states = concat_and_gather_tuple_states( past_cell_states, cell_state) else: ranked_cell_states = tf.gather( tf.concat(axis=1, values=[past_cell_states, tf.expand_dims(cell_state, 1)]), parent_refs) compound_cell_state = ( beam_symbols, beam_logprobs, ranked_cell_states ) ranked_cell_output = tf.gather(cell_output, parent_refs) if self.use_copy and self.copy_fun == 'copynet': return ranked_cell_output, compound_cell_state, ranked_alignments, \ ranked_attns elif self.use_attention: return ranked_cell_output, compound_cell_state, ranked_alignments, \ ranked_attns else: return ranked_cell_output, compound_cell_state
def embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell, num_encoder_symbols, num_decoder_symbols, embedding_size, output_projection=None, feed_previous=False, dtype=None, scope=None, initial_state_attention=False): """Embedding sequence-to-sequence model with attention. This model first embeds encoder_inputs by a newly created embedding (of shape [num_encoder_symbols x input_size]). Then it runs an RNN to encode embedded encoder_inputs into a state vector. It keeps the outputs of this RNN at every step to use for attention later. Next, it embeds decoder_inputs by another newly created embedding (of shape [num_decoder_symbols x input_size]). Then it runs attention decoder, initialized with the last encoder state, on embedded decoder_inputs and attending to encoder outputs. Warning: when output_projection is None, the size of the attention vectors and variables will be made proportional to num_decoder_symbols, can be large. Args: encoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. decoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. cell: rnn_cell.RNNCell defining the cell function and size. num_encoder_symbols: Integer; number of symbols on the encoder side. num_decoder_symbols: Integer; number of symbols on the decoder side. embedding_size: Integer, the length of the embedding vector for each symbol. output_projection: None or a pair (W, B) of output projection weights and biases; W has shape [output_size x num_decoder_symbols] and B has shape [num_decoder_symbols]; if provided and feed_previous=True, each fed previous output will first be multiplied by W and added B. feed_previous: Boolean or scalar Boolean Tensor; if True, only the first of decoder_inputs will be used (the "GO" symbol), and all other decoder inputs will be taken from previous outputs (as in embedding_rnn_decoder). If False, decoder_inputs are used as given (the standard decoder case). dtype: The dtype of the initial RNN state (default: tf.float32). scope: VariableScope for the created subgraph; defaults to "embedding_attention_seq2seq". initial_state_attention: If False (default), initial attentions are zero. If True, initialize the attentions from the initial state and attention states. Returns: A tuple of the form (outputs, state), where: outputs: A list of the same length as decoder_inputs of 2D Tensors with shape [batch_size x num_decoder_symbols] containing the generated outputs. state: The state of each decoder cell at the final time-step. It is a 2D Tensor of shape [batch_size x cell.state_size]. """ with variable_scope.variable_scope( scope or "embedding_attention_seq2seq", dtype=dtype) as scope: dtype = scope.dtype # Encoder. encoder_cell = rnn_cell.EmbeddingWrapper( cell, embedding_classes=num_encoder_symbols, embedding_size=embedding_size) encoder_outputs, encoder_state = rnn.rnn( encoder_cell, encoder_inputs, dtype=dtype) # First calculate a concatenation of encoder outputs to put attention on. top_states = [ array_ops.reshape(e, [-1, 1, cell.output_size]) for e in encoder_outputs ] # attention_states = array_ops.concat(top_states, 1) attention_states = array_ops.concat(1, top_states) # Decoder. output_size = None if output_projection is None: cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols) output_size = num_decoder_symbols if isinstance(feed_previous, bool): return embedding_attention_decoder( decoder_inputs, encoder_state, attention_states, cell, num_decoder_symbols, embedding_size, output_size=output_size, output_projection=output_projection, feed_previous=feed_previous, initial_state_attention=initial_state_attention) # If feed_previous is a Tensor, we construct 2 graphs and use cond. def decoder(feed_previous_bool): reuse = None if feed_previous_bool else True with variable_scope.variable_scope( variable_scope.get_variable_scope(), reuse=reuse) as scope: outputs, state = embedding_attention_decoder( decoder_inputs, encoder_state, attention_states, cell, num_decoder_symbols, embedding_size, output_size=output_size, output_projection=output_projection, feed_previous=feed_previous_bool, update_embedding_for_previous=False, initial_state_attention=initial_state_attention) state_list = [state] if nest.is_sequence(state): state_list = nest.flatten(state) return outputs + state_list outputs_and_state = control_flow_ops.cond(feed_previous, lambda: decoder(True), lambda: decoder(False)) outputs_len = len(decoder_inputs) # Outputs length same as decoder inputs. state_list = outputs_and_state[outputs_len:] state = state_list[0] if nest.is_sequence(encoder_state): state = nest.pack_sequence_as( structure=encoder_state, flat_sequence=state_list) return outputs_and_state[:outputs_len], state
def one2many_rnn_seq2seq(encoder_inputs, decoder_inputs_dict, cell, num_encoder_symbols, num_decoder_symbols_dict, embedding_size, feed_previous=False, dtype=None, scope=None): """One-to-many RNN sequence-to-sequence model (multi-task). This is a multi-task sequence-to-sequence model with one encoder and multiple decoders. Reference to multi-task sequence-to-sequence learning can be found here: http://arxiv.org/abs/1511.06114 Args: encoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. decoder_inputs_dict: A dictionany mapping decoder name (string) to the corresponding decoder_inputs; each decoder_inputs is a list of 1D Tensors of shape [batch_size]; num_decoders is defined as len(decoder_inputs_dict). cell: core_rnn_cell.RNNCell defining the cell function and size. num_encoder_symbols: Integer; number of symbols on the encoder side. num_decoder_symbols_dict: A dictionary mapping decoder name (string) to an integer specifying number of symbols for the corresponding decoder; len(num_decoder_symbols_dict) must be equal to num_decoders. embedding_size: Integer, the length of the embedding vector for each symbol. feed_previous: Boolean or scalar Boolean Tensor; if True, only the first of decoder_inputs will be used (the "GO" symbol), and all other decoder inputs will be taken from previous outputs (as in embedding_rnn_decoder). If False, decoder_inputs are used as given (the standard decoder case). dtype: The dtype of the initial state for both the encoder and encoder rnn cells (default: tf.float32). scope: VariableScope for the created subgraph; defaults to "one2many_rnn_seq2seq" Returns: A tuple of the form (outputs_dict, state_dict), where: outputs_dict: A mapping from decoder name (string) to a list of the same length as decoder_inputs_dict[name]; each element in the list is a 2D Tensors with shape [batch_size x num_decoder_symbol_list[name]] containing the generated outputs. state_dict: A mapping from decoder name (string) to the final state of the corresponding decoder RNN; it is a 2D Tensor of shape [batch_size x cell.state_size]. """ outputs_dict = {} state_dict = {} with variable_scope.variable_scope(scope or "one2many_rnn_seq2seq", dtype=dtype) as scope: dtype = scope.dtype # Encoder. encoder_cell = core_rnn_cell.EmbeddingWrapper( cell, embedding_classes=num_encoder_symbols, embedding_size=embedding_size) _, encoder_state = core_rnn.static_rnn(encoder_cell, encoder_inputs, dtype=dtype) # Decoder. for name, decoder_inputs in decoder_inputs_dict.items(): num_decoder_symbols = num_decoder_symbols_dict[name] with variable_scope.variable_scope("one2many_decoder_" + str(name)) as scope: decoder_cell = core_rnn_cell.OutputProjectionWrapper( cell, num_decoder_symbols) if isinstance(feed_previous, bool): outputs, state = embedding_rnn_decoder( decoder_inputs, encoder_state, decoder_cell, num_decoder_symbols, embedding_size, feed_previous=feed_previous) else: # If feed_previous is a Tensor, we construct 2 graphs and use cond. def filled_embedding_rnn_decoder(feed_previous): """The current decoder with a fixed feed_previous parameter.""" # pylint: disable=cell-var-from-loop reuse = None if feed_previous else True vs = variable_scope.get_variable_scope() with variable_scope.variable_scope(vs, reuse=reuse): outputs, state = embedding_rnn_decoder( decoder_inputs, encoder_state, decoder_cell, num_decoder_symbols, embedding_size, feed_previous=feed_previous) # pylint: enable=cell-var-from-loop state_list = [state] if nest.is_sequence(state): state_list = nest.flatten(state) return outputs + state_list outputs_and_state = control_flow_ops.cond( feed_previous, lambda: filled_embedding_rnn_decoder(True), lambda: filled_embedding_rnn_decoder(False)) # Outputs length is the same as for decoder inputs. outputs_len = len(decoder_inputs) outputs = outputs_and_state[:outputs_len] state_list = outputs_and_state[outputs_len:] state = state_list[0] if nest.is_sequence(encoder_state): state = nest.pack_sequence_as(structure=encoder_state, flat_sequence=state_list) outputs_dict[name] = outputs state_dict[name] = state return outputs_dict, state_dict
def dynamic_seq2seq(encoder_inputs, decoder_inputs, query_inputs, cell_encoder_fw, cell_encoder_bw, distraction_cell, num_encoder_symbols, num_decoder_symbols, embedding_size, initial_embedding = None, num_heads=1, embedding_trainable=False, output_projection=None, feed_previous=False, dtype=None, scope=None, initial_state_attention=False): """Embedding sequence-to-sequence model with attention. This model first embeds encoder_inputs by a newly created embedding (of shape [num_encoder_symbols x input_size]). Then it runs an RNN to encode embedded encoder_inputs into a state vector. It keeps the outputs of this RNN at every step to use for attention later. Next, it embeds decoder_inputs by another newly created embedding (of shape [num_decoder_symbols x input_size]). Then it runs attention decoder, initialized with the last encoder state, on embedded decoder_inputs and attending to encoder outputs. Args: encoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. decoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. cell: rnn_cell.RNNCell defining the cell function and size. num_encoder_symbols: Integer; number of symbols on the encoder side. num_decoder_symbols: Integer; number of symbols on the decoder side. embedding_size: Integer, the length of the embedding vector for each symbol. num_heads: Number of attention heads that read from attention_states. output_projection: None or a pair (W, B) of output projection weights and biases; W has shape [output_size x num_decoder_symbols] and B has shape [num_decoder_symbols]; if provided and feed_previous=True, each fed previous output will first be multiplied by W and added B. feed_previous: Boolean or scalar Boolean Tensor; if True, only the first of decoder_inputs will be used (the "GO" symbol), and all other decoder inputs will be taken from previous outputs (as in embedding_rnn_decoder). If False, decoder_inputs are used as given (the standard decoder case). dtype: The dtype of the initial RNN state (default: tf.float32). scope: VariableScope for the created subgraph; defaults to "embedding_attention_seq2seq". initial_state_attention: If False (default), initial attentions are zero. If True, initialize the attentions from the initial state and attention states. Returns: A tuple of the form (outputs, state), where: outputs: A list of the same length as decoder_inputs of 2D Tensors with shape [batch_size x num_decoder_symbols] containing the generated outputs. state: The state of each decoder cell at the final time-step. It is a 2D Tensor of shape [batch_size x cell.state_size]. """ with variable_scope.variable_scope( scope or "dynamic_seq2seq", dtype=dtype) as scope: dtype = scope.dtype # Encoder. """encoder_cell = rnn_cell.EmbeddingWrapper( cell, embedding_classes=num_encoder_symbols, embedding_size=embedding_size) """ if initial_embedding is not None: embedding = variable_scope.get_variable('embedding', initializer=initial_embedding, trainable=embedding_trainable) else: embedding = variable_scope.get_variable('embedding', [num_encoder_symbols, embedding_size],trainable=embedding_trainable) embedded_inputs = embedding_ops.embedding_lookup(embedding, encoder_inputs) embedded_inputs = array_ops.unpack(embedded_inputs) query_embeddings = embedding_ops.embedding_lookup(embedding, query_inputs) query_embeddings = array_ops.unpack(query_embeddings) print ("Embedded Inputs length:", len(embedded_inputs)) print("Shape in embedded inputs:", embedded_inputs[0].get_shape()) with variable_scope.variable_scope("Encoder_Cell"): encoder_outputs, encoder_state_fw, encoder_state_bw = rnn.bidirectional_rnn( cell_encoder_fw, cell_encoder_bw, embedded_inputs, dtype=dtype) with variable_scope.variable_scope("Query_Cell"): query_outputs, query_state_fw, query_state_bw = rnn.bidirectional_rnn( cell_encoder_fw, cell_encoder_bw, query_embeddings, dtype = dtype) # First calculate a concatenation of encoder outputs to put attention on. encoder_state = array_ops.concat(1, [encoder_state_fw, encoder_state_bw]) query_state = array_ops.concat(1, [query_state_fw, query_state_bw]) top_states_encoder = [array_ops.reshape(e, [-1, 1, 2*cell_encoder_fw.output_size]) for e in encoder_outputs] attention_states_encoder = array_ops.concat(1, top_states_encoder) top_states_query = [array_ops.reshape(e, [-1, 1, 2*cell_encoder_fw.output_size]) for e in query_outputs] attention_states_query = array_ops.concat(1, top_states_query) # Decoder. output_size = None if output_projection is None: cell_encoder_fw = rnn_cell.OutputProjectionWrapper(cell_encoder_fw, num_decoder_symbols) output_size = num_decoder_symbols if isinstance(feed_previous, bool): return dynamic_decoder_wrapper( decoder_inputs, initial_state=encoder_state, attention_state=attention_states_encoder, attention_states_query = attention_states_query, cell_encoder = cell_encoder_fw, num_symbols = num_decoder_symbols, embedding_size = embedding_size, distract_initial_state = encoder_state, num_heads=num_heads, output_size=output_size, output_projection=output_projection, feed_previous=feed_previous, embedding_scope = scope, initial_state_attention=initial_state_attention) # If feed_previous is a Tensor, we construct 2 graphs and use cond. def decoder(feed_previous_bool): reuse = None if feed_previous_bool else True with variable_scope.variable_scope( variable_scope.get_variable_scope(), reuse=reuse) as scope: outputs, state, query_weights, doc_weights = dynamic_decoder_wrapper( decoder_inputs, initial_state=encoder_state, attention_states=attention_states_encoder, attention_states_query = attention_states_query, cell_encoder = cell_encoder_fw, num_symbols=num_decoder_symbols, embedding_size = embedding_size, distract_initial_state = encoder_state, distraction_cell = distraction_cell, num_heads=num_heads, output_size=output_size, output_projection=output_projection, feed_previous=feed_previous_bool, embedding_scope = scope, update_embedding_for_previous=False, initial_state_attention=initial_state_attention) state_list = [state] if nest.is_sequence(state): state_list = nest.flatten(state) print (len(outputs), len(state_list), len(query_weights), len(doc_weights)) return outputs + state_list + query_weights + doc_weights outputs_and_state = control_flow_ops.cond(feed_previous, lambda: decoder(True), lambda: decoder(False)) outputs_len = len(decoder_inputs) # Outputs length same as decoder inputs. query_len = len(decoder_inputs) doc_len = len(decoder_inputs) state_list = outputs_and_state[outputs_len:-(query_len + doc_len)] m = len(state_list) print (outputs_len, query_len, doc_len, m, len(outputs_and_state)) state = state_list[0] if nest.is_sequence(encoder_state): state = nest.pack_sequence_as(structure=encoder_state, flat_sequence=state_list) return outputs_and_state[:outputs_len], state, outputs_and_state[outputs_len + m: outputs_len + m + query_len], outputs_and_state[outputs_len + m + query_len:]
def embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell, num_encoder_symbols, num_decoder_symbols, embedding_size, num_heads=1, output_projection=None, feed_previous=False, dtype=None, scope=None, initial_state_attention=False, mc_search=False): with variable_scope.variable_scope( scope or "embedding_attention_seq2seq", dtype=dtype) as scope: dtype = scope.dtype # Encoder. # 改:encoder_cell = tf.contrib.rnn.EmbeddingWrapper( # cell, embedding_classes=num_encoder_symbols, # embedding_size=embedding_size) encoder_cell = core_rnn_cell.EmbeddingWrapper( cell, embedding_classes=num_encoder_symbols, embedding_size=embedding_size) # 改:encoder_outputs, encoder_state =tf.contrib.rnn.static_rnn( # encoder_cell, encoder_inputs, dtype=dtype) encoder_outputs, encoder_state =tf.nn.static_rnn( encoder_cell, encoder_inputs, dtype=dtype) # First calculate a concatenation of encoder outputs to put attention on. top_states = [array_ops.reshape(e, [-1, 1, cell.output_size]) for e in encoder_outputs] print(top_states) attention_states = array_ops.concat (top_states,1) # Decoder. output_size = None if output_projection is None: cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols) output_size = num_decoder_symbols if isinstance(feed_previous, bool): outputs, state = embedding_attention_decoder( decoder_inputs, encoder_state, attention_states, cell, num_decoder_symbols, embedding_size, num_heads=num_heads, output_size=output_size, output_projection=output_projection, feed_previous=feed_previous, initial_state_attention=initial_state_attention, mc_search=mc_search, scope=scope) return outputs, state, encoder_state # If feed_previous is a Tensor, we construct 2 graphs and use cond. def decoder(feed_previous_bool): reuse = None if feed_previous_bool else True with variable_scope.variable_scope( variable_scope.get_variable_scope(), reuse=reuse) as scope: outputs, state = embedding_attention_decoder( decoder_inputs, encoder_state, attention_states, cell, num_decoder_symbols, embedding_size, num_heads=num_heads, output_size=output_size, output_projection=output_projection, feed_previous=feed_previous_bool, update_embedding_for_previous=False, initial_state_attention=initial_state_attention, mc_search=mc_search, scope=scope) state_list = [state] if nest.is_sequence(state): state_list = nest.flatten(state) return outputs + state_list outputs_and_state = control_flow_ops.cond(feed_previous, lambda: decoder(True), lambda: decoder(False)) outputs_len = len(decoder_inputs) # Outputs length same as decoder inputs. state_list = outputs_and_state[outputs_len:] state = state_list[0] if nest.is_sequence(encoder_state): state = nest.pack_sequence_as(structure=encoder_state, flat_sequence=state_list) return outputs_and_state[:outputs_len], state, encoder_state
def _linear(args, output_size, bias, scope=None, use_fp16=False): """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable. Args: args: a 2D Tensor or a list of 2D, batch x n, Tensors. output_size: int, second dimension of W[i]. bias: boolean, whether to add a bias term or not. bias_start: starting value to initialize the bias; 0 by default. scope: VariableScope for the created subgraph; defaults to "Linear". Returns: A 2D Tensor with shape [batch x output_size] equal to sum_i(args[i] * W[i]), where W[i]s are newly created matrices. Raises: ValueError: if some of the arguments has unspecified or wrong shape. """ if args is None or (nest.is_sequence(args) and not args): raise ValueError("`args` must be specified") if not nest.is_sequence(args): args = [args] # Calculate the total size of arguments on dimension 1. total_arg_size = 0 shapes = [a.get_shape().as_list() for a in args] for shape in shapes: if len(shape) != 2: raise ValueError("Linear is expecting 2D arguments: %s" % str(shapes)) if not shape[1]: raise ValueError("Linear expects shape[1] of arguments: %s" % str(shapes)) else: total_arg_size += shape[1] dtype = [a.dtype for a in args][0] # Now the computation. with tf.variable_scope(scope or "Linear"): matrix = _variable_on_cpu('Matrix', [total_arg_size, output_size], use_fp16=use_fp16) if use_fp16: dtype = tf.float16 else: dtype = tf.float32 args = [tf.cast(x, dtype) for x in args] if len(args) == 1: res = tf.matmul(args[0], matrix, transpose_a=False, transpose_b=True) else: res = tf.matmul(tf.concat(args, 1), matrix, transpose_a=False, transpose_b=True) if not bias: return res bias_term = _variable_on_cpu('Bias', [output_size], tf.constant_initializer(0), use_fp16=use_fp16) return res + bias_term
def _as_tuple(value): if not nest.is_sequence(value): return value return tuple([_as_tuple(v) for v in value])
def state_saving_rnn(cell, inputs, state_saver, state_name, sequence_length=None, scope=None): """RNN that accepts a state saver for time-truncated RNN calculation. Args: cell: An instance of `RNNCell`. inputs: A length T list of inputs, each a tensor of shape `[batch_size, input_size]`. state_saver: A state saver object with methods `state` and `save_state`. state_name: Python string or tuple of strings. The name to use with the state_saver. If the cell returns tuples of states (i.e., `cell.state_size` is a tuple) then `state_name` should be a tuple of strings having the same length as `cell.state_size`. Otherwise it should be a single string. sequence_length: (optional) An int32/int64 vector size [batch_size]. See the documentation for rnn() for more details about sequence_length. scope: VariableScope for the created subgraph; defaults to "RNN". Returns: A pair (outputs, state) where: outputs is a length T list of outputs (one for each input) states is the final state Raises: TypeError: If `cell` is not an instance of RNNCell. ValueError: If `inputs` is `None` or an empty list, or if the arity and type of `state_name` does not match that of `cell.state_size`. """ state_size = cell.state_size state_is_tuple = nest.is_sequence(state_size) state_name_tuple = nest.is_sequence(state_name) if state_is_tuple != state_name_tuple: raise ValueError( "state_name should be the same type as cell.state_size. " "state_name: %s, cell.state_size: %s" % (str(state_name), str(state_size))) if state_is_tuple: state_name_flat = nest.flatten(state_name) state_size_flat = nest.flatten(state_size) if len(state_name_flat) != len(state_size_flat): raise ValueError("#elems(state_name) != #elems(state_size): %d vs. %d" % (len(state_name_flat), len(state_size_flat))) initial_state = nest.pack_sequence_as( structure=state_name, flat_sequence=[state_saver.state(n) for n in state_name_flat]) else: initial_state = state_saver.state(state_name) (outputs, state) = rnn(cell, inputs, initial_state=initial_state, sequence_length=sequence_length, scope=scope) if state_is_tuple: state_flat = nest.flatten(state) save_state = [ state_saver.save_state(n, s) for (n, s) in zip(state_name_flat, state_flat)] else: save_state = [state_saver.save_state(state_name, state)] with ops.control_dependencies(save_state): outputs[-1] = array_ops.identity(outputs[-1]) return (outputs, state)
def _linear(args, output_size, bias, bias_initializer=None, kernel_initializer=None): """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable. Args: args: a 2D Tensor or a list of 2D, batch x n, Tensors. output_size: int, second dimension of W[i]. bias: boolean, whether to add a bias term or not. bias_initializer: starting value to initialize the bias (default is all zeros). kernel_initializer: starting value to initialize the weight. Returns: A 2D Tensor with shape [batch x output_size] equal to sum_i(args[i] * W[i]), where W[i]s are newly created matrices. Raises: ValueError: if some of the arguments has unspecified or wrong shape. """ if args is None or (nest.is_sequence(args) and not args): raise ValueError("`args` must be specified") if not nest.is_sequence(args): args = [args] # Calculate the total size of arguments on dimension 1. total_arg_size = 0 shapes = [a.get_shape() for a in args] for shape in shapes: if shape.ndims != 2: raise ValueError("linear is expecting 2D arguments: %s" % shapes) if shape[1].value is None: raise ValueError( "linear expects shape[1] to be provided for shape %s, " "but saw %s" % (shape, shape[1])) else: total_arg_size += shape[1].value dtype = [a.dtype for a in args][0] # Now the computation. scope = vs.get_variable_scope() with vs.variable_scope(scope) as outer_scope: weights = vs.get_variable(_WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size], dtype=dtype, initializer=kernel_initializer) if len(args) == 1: res = math_ops.matmul(args[0], weights) else: res = math_ops.matmul(array_ops.concat(args, 1), weights) if not bias: return res with vs.variable_scope(outer_scope) as inner_scope: inner_scope.set_partitioner(None) if bias_initializer is None: bias_initializer = init_ops.constant_initializer(0.0, dtype=dtype) biases = vs.get_variable(_BIAS_VARIABLE_NAME, [output_size], dtype=dtype, initializer=bias_initializer) return nn_ops.bias_add(res, biases)
def _maxlens(x, d=0): n = len(x) ret = [(n, d)] if n > 0 and nest.is_sequence(x[0]): ret.extend(map(lambda y: _maxlens(y, d + 1), x)) return nest.flatten(ret)
def _rnn_step( time, sequence_length, min_sequence_length, max_sequence_length, zero_output, state, call_cell, state_size, skip_conditionals=False): """Calculate one step of a dynamic RNN minibatch. Returns an (output, state) pair conditioned on the sequence_lengths. When skip_conditionals=False, the pseudocode is something like: if t >= max_sequence_length: return (zero_output, state) if t < min_sequence_length: return call_cell() # Selectively output zeros or output, old state or new state depending # on if we've finished calculating each row. new_output, new_state = call_cell() final_output = np.vstack([ zero_output if time >= sequence_lengths[r] else new_output_r for r, new_output_r in enumerate(new_output) ]) final_state = np.vstack([ state[r] if time >= sequence_lengths[r] else new_state_r for r, new_state_r in enumerate(new_state) ]) return (final_output, final_state) Args: time: Python int, the current time step sequence_length: int32 `Tensor` vector of size [batch_size] min_sequence_length: int32 `Tensor` scalar, min of sequence_length max_sequence_length: int32 `Tensor` scalar, max of sequence_length zero_output: `Tensor` vector of shape [output_size] state: Either a single `Tensor` matrix of shape `[batch_size, state_size]`, or a list/tuple of such tensors. call_cell: lambda returning tuple of (new_output, new_state) where new_output is a `Tensor` matrix of shape `[batch_size, output_size]`. new_state is a `Tensor` matrix of shape `[batch_size, state_size]`. state_size: The `cell.state_size` associated with the state. skip_conditionals: Python bool, whether to skip using the conditional calculations. This is useful for `dynamic_rnn`, where the input tensor matches `max_sequence_length`, and using conditionals just slows everything down. Returns: A tuple of (`final_output`, `final_state`) as given by the pseudocode above: final_output is a `Tensor` matrix of shape [batch_size, output_size] final_state is either a single `Tensor` matrix, or a tuple of such matrices (matching length and shapes of input `state`). Raises: ValueError: If the cell returns a state tuple whose length does not match that returned by `state_size`. """ state_is_tuple = nest.is_sequence(state) # Convert state to a list for ease of use state = list(nest.flatten(state)) if state_is_tuple else [state] state_shape = [s.get_shape() for s in state] def _copy_some_through(new_output, new_state): # Use broadcasting select to determine which values should get # the previous state & zero output, and which values should get # a calculated state & output. copy_cond = (time >= sequence_length) return ([math_ops.select(copy_cond, zero_output, new_output)] + [math_ops.select(copy_cond, old_s, new_s) for (old_s, new_s) in zip(state, new_state)]) def _maybe_copy_some_through(): """Run RNN step. Pass through either no or some past state.""" new_output, new_state = call_cell() new_state = ( list(nest.flatten(new_state)) if state_is_tuple else [new_state]) if len(state) != len(new_state): raise ValueError( "Input and output state tuple lengths do not match: %d vs. %d" % (len(state), len(new_state))) return control_flow_ops.cond( # if t < min_seq_len: calculate and return everything time < min_sequence_length, lambda: [new_output] + new_state, # else copy some of it through lambda: _copy_some_through(new_output, new_state)) # TODO(ebrevdo): skipping these conditionals may cause a slowdown, # but benefits from removing cond() and its gradient. We should # profile with and without this switch here. if skip_conditionals: # Instead of using conditionals, perform the selective copy at all time # steps. This is faster when max_seq_len is equal to the number of unrolls # (which is typical for dynamic_rnn). new_output, new_state = call_cell() new_state = ( list(nest.flatten(new_state)) if state_is_tuple else [new_state]) if len(state) != len(new_state): raise ValueError( "Input and output state tuple lengths do not match: %d vs. %d" % (len(state), len(new_state))) final_output_and_state = _copy_some_through(new_output, new_state) else: empty_update = lambda: [zero_output] + list(state) final_output_and_state = control_flow_ops.cond( # if t >= max_seq_len: copy all state through, output zeros time >= max_sequence_length, empty_update, # otherwise calculation is required: copy some or all of it through _maybe_copy_some_through) (final_output, final_state) = ( final_output_and_state[0], final_output_and_state[1:]) final_output.set_shape(zero_output.get_shape()) for final_state_i, state_shape_i in zip(final_state, state_shape): final_state_i.set_shape(state_shape_i) if state_is_tuple: return ( final_output, nest.pack_sequence_as(structure=state_size, flat_sequence=final_state)) else: return (final_output, final_state[0])
def _line_sep(self, args, output_size, bias, bias_initializer=None, kernel_initializer=None): if args is None or (nest.is_sequence(args) and not args): raise ValueError("`args` must be specified") if not nest.is_sequence(args): args = [args] # Calculate the total size of arguments on dimension 1. total_arg_size = 0 shapes = [a.get_shape() for a in args] for shape in shapes: if shape.ndims != 2: raise ValueError("linear is expecting 2D arguments: %s" % shapes) if shape[1].value is None: raise ValueError("linear expects shape[1] to \ be provided for shape %s, " "but saw %s" % (shape, shape[1])) else: total_arg_size += shape[1].value dtype = [a.dtype for a in args][0] # Now the computation. scope = vs.get_variable_scope() with vs.variable_scope(scope) as outer_scope: [x, h] = args x_size = x.get_shape().as_list()[1] W_xh = tf.get_variable('W_xh', [x_size, output_size], initializer=weights_initializer) W_hh = tf.get_variable('W_hh', [int(output_size / 4), output_size], initializer=weights_initializer) #x = tf.Print(x,[tf.reduce_mean(x)], str(scope)+'x: ') #h = tf.Print(h,[tf.reduce_mean(h)], str(scope)+'h: ') #W_xh = tf.Print(W_xh,[tf.reduce_mean(W_xh)], str(scope)+'W_xh: ') #W_hh = tf.Print(W_hh,[tf.reduce_mean(W_hh)], str(scope)+'W_hh: ') cn_xh = self.cosine_norm(x, W_xh, 'cn_xh') # one hot vector cn_hh = self.cosine_norm(h, W_hh, 'cn_hh') #cn_xh = tf.Print(cn_xh,[tf.reduce_mean(cn_xh)], str(scope)+'cn_xh: ') #cn_hh = tf.Print(cn_hh,[tf.reduce_mean(cn_hh)], str(scope)+'cn_hh: ') res = cn_xh + cn_hh if not bias: return res with vs.variable_scope(outer_scope) as inner_scope: inner_scope.set_partitioner(None) if bias_initializer is None: bias_initializer = init_ops.constant_initializer( 0.0, dtype=dtype) biases = vs.get_variable(_BIAS_VARIABLE_NAME, [output_size], dtype=dtype, initializer=bias_initializer) return nn_ops.bias_add(res, biases)
def BiRNNModel(cell_fw, cell_bw, inputs, initial_state_fw=None, initial_state_bw=None, dtype=None, sequence_length=None, num_cell_layers=None, scope=None): """Creates a bidirectional recurrent neural network. Similar to the unidirectional case above (rnn) but takes input and builds independent forward and backward RNNs with the final forward and backward outputs depth-concatenated, such that the output will have the format [time][batch][cell_fw.output_size + cell_bw.output_size]. The input_size of forward and backward cell must match. The initial state for both directions is zero by default (but can be set optionally) and no intermediate states are ever returned -- the network is fully unrolled for the given (passed in) length(s) of the sequence(s) or completely unrolled if length(s) is not given. Args: cell_fw: An instance of RNNCell, to be used for forward direction. cell_bw: An instance of RNNCell, to be used for backward direction. inputs: A length T list of inputs, each a tensor of shape [batch_size, input_size]. initial_state_fw: (optional) An initial state for the forward RNN. This must be a tensor of appropriate type and shape `[batch_size x cell_fw.state_size]`. If `cell_fw.state_size` is a tuple, this should be a tuple of tensors having shapes `[batch_size, s] for s in cell_fw.state_size`. initial_state_bw: (optional) Same as for `initial_state_fw`, but using the corresponding properties of `cell_bw`. dtype: (optional) The data type for the initial state. Required if either of the initial states are not provided. sequence_length: (optional) An int32/int64 vector, size `[batch_size]`, containing the actual lengths for each of the sequences. num_cell_layers: Num of layers of the RNN cell. (Mainly used for generating output state representations for multi-layer RNN cells.) scope: VariableScope for the created subgraph; defaults to "BiRNN" Returns: A tuple (outputs, output_states) where: outputs is a length `T` list of outputs (one for each input), which are depth-concatenated forward and backward outputs. output_states is a length `T` list of hidden states (one for each step), which are depth-concatenated forward and backward states. Raises: TypeError: If `cell_fw` or `cell_bw` is not an instance of `RNNCell`. ValueError: If inputs is None or an empty list. """ if not isinstance(cell_fw, tf.nn.rnn_cell.RNNCell): raise TypeError("cell_fw must be an instance of RNNCell") if not isinstance(cell_bw, tf.nn.rnn_cell.RNNCell): raise TypeError("cell_bw must be an instance of RNNCell") if not isinstance(inputs, list): raise TypeError("inputs must be a list") if not inputs: raise ValueError("inputs must not be empty") name = scope or "BiRNN" # Forward direction with tf.variable_scope(name + "_FW") as fw_scope: output_fw, states_fw = RNNModel(cell_fw, inputs, initial_state_fw, dtype, sequence_length, num_cell_layers=num_cell_layers, scope=fw_scope) # Backward direction with tf.variable_scope(name + "_BW") as bw_scope: tmp, tmp_states = RNNModel(cell_bw, _reverse_seq(inputs, sequence_length), initial_state_bw, dtype, sequence_length, num_cell_layers=num_cell_layers, scope=bw_scope) output_bw = _reverse_seq(tmp, sequence_length) states_bw = _reverse_seq(tmp_states, sequence_length) # Concat each of the forward/backward outputs outputs = [tf.concat(axis=1, values=[fw, bw]) for fw, bw in zip(output_fw, output_bw)] # Notice that the computation of the encoder final state uses the final state # of the backward RNN without reverse!!! if nest.is_sequence(cell_fw.state_size): output_states = [nest_map_dual(lambda x, y: tf.concat(axis=1, values=[x, y]), fw, bw) for fw, bw in zip(states_fw, tmp_states)] else: if num_cell_layers > 1: output_states = [] for fw, bw in zip(states_fw, tmp_states): output_states.append(tf.concat(axis=1, values=[tf.concat(axis=1, values=[l_fw, l_bw]) for l_fw, l_bw in zip(tf.split(axis=1, num_or_size_splits=num_cell_layers, value=fw), tf.split(axis=1, num_or_size_splits=num_cell_layers, value=bw))])) else: output_states = [tf.concat(axis=1, values=[fw, bw]) for fw, bw in zip(states_fw, tmp_states)] return (outputs, output_states)