def testWeightSpecificSparsity(self):
  """Checks that `weight_sparsity_map` overrides `target_sparsity` per weight.

  Two identical 100-element weight vectors are masked under the scopes
  "layer1" and "layer2". Only "layer2/weights" has an entry in the map
  (0.75), so after the mask update op has been run 110 times the reported
  sparsities are expected to be [0.5, 0.75].
  """
  hparam_spec = ",".join((
      "begin_pruning_step=1",
      "pruning_frequency=1",
      "end_pruning_step=100",
      "target_sparsity=0.5",
      "weight_sparsity_map=[layer2/weights:0.75]",
      "threshold_decay=0.0",
  ))
  hparams = pruning.get_pruning_hparams().parse(hparam_spec)

  # Build one masked weight vector per layer scope, in scope order.
  for layer_name in ("layer1", "layer2"):
    with variable_scope.variable_scope(layer_name):
      weights = variables.Variable(
          math_ops.linspace(1.0, 100.0, 100), name="weights")
      pruning.apply_mask(weights)

  update_masks = pruning.Pruning(hparams).conditional_mask_update_op()
  bump_global_step = state_ops.assign_add(self.global_step, 1)

  with self.cached_session() as sess:
    variables.global_variables_initializer().run()
    for _ in range(110):
      sess.run(update_masks)
      sess.run(bump_global_step)
    sparsities = sess.run(pruning.get_weight_sparsity())
    self.assertAllEqual(sparsities, [0.5, 0.75])
def testFunctionCallInDifferentVariableScopes(self):
  """Calls one Defun under two variable scopes and checks the variable made.

  The test expects exactly one global variable, named "vs1/var:0", and
  expects both calls to produce the fed input plus one.
  """

  @function.Defun(dtypes.float32)
  def Foo(inputs):
    ones_var = variable_scope.get_variable(
        "var",
        shape=[10],
        dtype=dtypes.float32,
        initializer=init_ops.ones_initializer())
    return inputs + ones_var

  placeholder = array_ops.placeholder(shape=[10], dtype=dtypes.float32)
  with variable_scope.variable_scope("vs1"):
    first_call = Foo(placeholder)
  with variable_scope.variable_scope("vs2"):
    second_call = Foo(placeholder)

  created = variables.global_variables()
  self.assertEqual(len(created), 1)
  self.assertEqual(created[0].name, "vs1/var:0")

  with session.Session() as sess:
    sess.run(variables.global_variables_initializer())
    results = sess.run(
        [first_call, second_call],
        feed_dict={placeholder: np.linspace(1, 10, 10)})
    first_value, second_value = results
    for value in (first_value, second_value):
      self.assertAllEqual(value, np.linspace(2, 11, 10))
def call(self, inputs, state):
  """Gated recurrent unit (GRU) with nunits cells.

  The candidate activation is built from two separate linear projections:
  one of `inputs` and one of `state`, the latter multiplied by the reset gate
  after projection.

  Args:
    inputs: input tensor for this step.
    state: previous hidden state.

  Returns:
    A pair `(new_h, new_h)`: the new hidden state, used both as the output
    and as the next state.
  """
  # pylint: disable=protected-access
  linear = rnn_cell_impl._linear
  # pylint: enable=protected-access
  num_units = self._num_units

  with vs.variable_scope("gates"):
    # Reset gate and update gate. Unless a bias initializer was supplied,
    # start with a bias of 1.0 to not reset and not update.
    if self._bias_initializer is None:
      gate_bias_init = init_ops.constant_initializer(1.0, dtype=inputs.dtype)
    else:
      gate_bias_init = self._bias_initializer
    gate_logits = linear([inputs, state], 2 * num_units, True, gate_bias_init,
                         self._kernel_initializer)
    gates = math_ops.sigmoid(gate_logits)
    r, u = array_ops.split(value=gates, num_or_size_splits=2, axis=1)

  with vs.variable_scope("candidate"):
    with vs.variable_scope("input_projection"):
      input_part = linear(inputs, num_units, True, self._bias_initializer,
                          self._kernel_initializer)
    with vs.variable_scope("hidden_projection"):
      state_projection = linear(state, num_units, True,
                                self._bias_initializer,
                                self._kernel_initializer)
      hidden_part = r * state_projection
    c = self._activation(input_part + hidden_part)

  new_h = u * state + (1 - u) * c
  return new_h, new_h
def __call__(self, inputs, state, scope=None):
  """Gated recurrent unit (GRU) with nunits cells and fixed dropout masks.

  `inputs` and `state` are multiplied element-wise by the stored masks
  `self._dropMaskInput` and `self._dropMaskState` before the gates and the
  candidate activation are computed.

  Args:
    inputs: input tensor for this step.
    state: previous hidden state.
    scope: optional variable scope; defaults to the class name.

  Returns:
    A pair `(new_h, new_h)`: the new hidden state, used both as the output
    and as the next state.

  Raises:
    AssertionError: if the shape of a dropout mask, ignoring the leading
      dimension, differs from that of the tensor it is applied to.
  """
  with vs.variable_scope(scope or type(self).__name__):
    # Validate with an explicit raise: the previous `assert(False)` is
    # stripped under `python -O`, which let mismatched masks through after
    # merely printing a message. AssertionError is kept so the exception type
    # is unchanged for callers.
    checks = (
        ("input", self._dropMaskInput, inputs),
        ("state", self._dropMaskState, state),
    )
    for label, mask, tensor in checks:
      mask_dims = mask.get_shape()[1:]
      tensor_dims = tensor.get_shape()[1:]
      if mask_dims != tensor_dims:
        raise AssertionError(
            "error: %s dropout mask shape mismatch: %s != %s" %
            (label, mask_dims, tensor_dims))

    # NOTE(review): `tf.mul` and `tf.split(axis, num, value)` are the
    # pre-1.0 TensorFlow spellings; left unchanged because the TensorFlow
    # version this file targets is not visible here.
    dropin = tf.mul(self._dropMaskInput, inputs)
    dropst = tf.mul(self._dropMaskState, state)

    with vs.variable_scope("Gates"):  # Reset gate and update gate.
      # We start with bias of 1.0 to not reset and not update.
      concat = rnn_cell._linear([dropin, dropst], 2 * self._num_units, True,
                                1.0)
      r, u = tf.split(1, 2, concat)
      r, u = tf.sigmoid(r), tf.sigmoid(u)

    with vs.variable_scope("Candidate"):
      htilda = self._activation(
          rnn_cell._linear([dropin, r * dropst], self._num_units, True))

    new_h = u * dropst + (1 - u) * htilda
  return new_h, new_h
def testInitFromCheckpoint(self):
  """Restores four variables from a checkpoint via `init_from_checkpoint`.

  The assignment maps cover a single variable given by name, a whole scope
  given by prefix, and a variable passed as an object.
  """
  checkpoint_dir = self.get_temp_dir()
  with self.test_session() as session:
    v1, v2, v3, v4 = _create_checkpoints(session, checkpoint_dir)

  # New graph and session.
  with ops.Graph().as_default() as fresh_graph:
    with self.test_session(graph=fresh_graph) as session:
      with variable_scope.variable_scope("some_scope"):
        my1 = variable_scope.get_variable("my1", [1, 10])
        with variable_scope.variable_scope("some_other_scope"):
          my2 = variable_scope.get_variable("my2", [10, 10])
          with variable_scope.variable_scope("other_useful_scope"):
            my4 = variable_scope.get_variable("var4", [9, 9])
      my3 = variable_scope.get_variable("my3", [100, 100])

      name_and_prefix_map = {
          "var1": "some_scope/my1",
          "useful_scope/": "some_scope/some_other_scope/other_useful_scope/",
      }
      name_and_object_map = {
          "var2": "some_scope/some_other_scope/my2",
          "var3": my3,
      }
      checkpoint_utils.init_from_checkpoint(checkpoint_dir,
                                            name_and_prefix_map)
      checkpoint_utils.init_from_checkpoint(checkpoint_dir,
                                            name_and_object_map)

      session.run(variables.global_variables_initializer())
      restored_and_saved = ((my1, v1), (my2, v2), (my3, v3), (my4, v4))
      for restored, saved in restored_and_saved:
        self.assertAllEqual(restored.eval(session), saved)

      # Check that tensors are not explicitly in the graph.
      graph_def_text = str(session.graph.as_graph_def())
      self.assertLess(len(graph_def_text), 29000)
def testGRUCell(self):
  """Smoke-tests `GRUCell(2)` with input widths 2 and 3."""
  with self.test_session() as sess:

    def run_cell(scope_name, input_row, expected):
      # Build a fresh cell under `scope_name`, feed one row, compare output.
      with variable_scope.variable_scope(
          scope_name, initializer=init_ops.constant_initializer(0.5)):
        x = array_ops.zeros([1, len(input_row)])
        m = array_ops.zeros([1, 2])
        g, _ = rnn_cell_impl.GRUCell(2)(x, m)
        sess.run([variables_lib.global_variables_initializer()])
        feed = {
            x.name: np.array([input_row]),
            m.name: np.array([[0.1, 0.1]]),
        }
        res = sess.run([g], feed)
        # Smoke test
        self.assertAllClose(res[0], expected)

    run_cell("root", [1., 1.], [[0.175991, 0.175991]])
    # Test GRUCell with input_size != num_units.
    run_cell("other", [1., 1., 1.], [[0.156736, 0.156736]])
def __call__(self, inputs, state, scope=None):
  """Long short-term memory cell with attention (LSTMA).

  Args:
    inputs: input tensor for this step.
    state: when `self._state_is_tuple` is set, a triple
      `(cell_state, attns, attn_states)`; otherwise one tensor holding those
      three parts side by side along axis 1.
    scope: optional variable scope; defaults to the class name.

  Returns:
    A pair `(output, new_state)` where `new_state` has the same layout
    (triple or single concatenated tensor) as `state`.
  """
  with vs.variable_scope(scope or type(self).__name__):
    if self._state_is_tuple:
      cell_state, attns, flat_attn_states = state
    else:
      # Cut the packed state into its three column ranges.
      cell_width = self._cell.state_size
      attn_width = self._attn_size
      cell_state = array_ops.slice(state, [0, 0], [-1, cell_width])
      attns = array_ops.slice(state, [0, cell_width], [-1, attn_width])
      flat_attn_states = array_ops.slice(
          state, [0, cell_width + attn_width],
          [-1, attn_width * self._attn_length])
    attn_states = array_ops.reshape(
        flat_attn_states, [-1, self._attn_length, self._attn_size])

    input_size = self._input_size
    if input_size is None:
      input_size = inputs.get_shape().as_list()[1]
    cell_inputs = _linear([inputs, attns], input_size, True)
    lstm_output, new_cell_state = self._cell(cell_inputs, cell_state)

    if self._state_is_tuple:
      new_state_cat = array_ops.concat(1, _unpacked_state(new_cell_state))
    else:
      new_state_cat = new_cell_state
    new_attns, new_attn_states = self._attention(new_state_cat, attn_states)

    with vs.variable_scope("AttnOutputProjection"):
      output = _linear([lstm_output, new_attns], self._attn_size, True)

    # Append this step's output to the attention states, then flatten them.
    expanded_output = array_ops.expand_dims(output, 1)
    new_attn_states = array_ops.concat(1, [new_attn_states, expanded_output])
    new_attn_states = array_ops.reshape(
        new_attn_states, [-1, self._attn_length * self._attn_size])

    new_state = (new_cell_state, new_attns, new_attn_states)
    if not self._state_is_tuple:
      new_state = array_ops.concat(1, list(new_state))
    return output, new_state
def __call__(self, inputs, state, scope=None):
  """Run this RNN cell on inputs, starting from the given state.

  Args:
    inputs: `2-D` tensor with shape `[batch_size, input_size]`.
    state: if `self.state_size` is an integer, this should be a `2-D Tensor`
      with shape `[batch_size, self.state_size]`. Otherwise, if
      `self.state_size` is a tuple of integers, this should be a tuple with
      shapes `[batch_size, s] for s in self.state_size`.
    scope: VariableScope for the created subgraph. When omitted, a scope
      context wrapping the current variable scope is built on the first call
      and cached on the instance as `rnncell_scope` for later calls.

  Returns:
    A pair containing:

    - Output: A `2-D` tensor with shape `[batch_size, self.output_size]`.
    - New state: Either a single `2-D` tensor, or a tuple of tensors matching
      the arity and shapes of `state`.
  """
  base_call = super(RNNCell, self).__call__

  if scope is not None:
    with vs.variable_scope(
        scope, custom_getter=self._rnn_get_variable) as entered_scope:
      return base_call(inputs, state, scope=entered_scope)

  # No explicit scope: reuse the cached scope context, creating it once.
  cached_scope = getattr(self, "rnncell_scope", None)
  if cached_scope is None:
    cached_scope = vs.variable_scope(
        vs.get_variable_scope(), custom_getter=self._rnn_get_variable)
    setattr(self, "rnncell_scope", cached_scope)
  with cached_scope:
    return base_call(inputs, state)
def testReuse(self):
  """Rebuilding a rev_block with `reuse=True` must not add variables.

  The variable count is compared before and after a reusing rebuild, and
  again after gradients have been constructed and the block is rebuilt.
  """

  def f(x):
    return core_layers.dense(x, self.CHANNELS // 2)

  def g(x):
    return core_layers.dense(x, self.CHANNELS // 2)

  def build_block(**scope_kwargs):
    with variable_scope.variable_scope("test", **scope_kwargs):
      return rev_block_lib.rev_block(
          x1, x2, f, g, num_layers=self.NUM_LAYERS)

  def count_global_variables():
    return len(variables.global_variables())

  x = random_ops.random_uniform(
      [self.BATCH_SIZE, self.CHANNELS], dtype=dtypes.float32)
  x1, x2 = array_ops.split(x, 2, axis=-1)

  build_block()
  num_vars_before = count_global_variables()

  y1, y2 = build_block(reuse=True)
  self.assertEqual(num_vars_before, count_global_variables())

  loss = math_ops.reduce_mean(y1 + y2)
  gradients_impl.gradients(loss, [x] + variables.trainable_variables())

  build_block(reuse=True)
  self.assertEqual(num_vars_before, count_global_variables())
def reduce_to_final(images, num_filters_out, nhidden=None, scope=None):
  """Reduce an image to a final state by running two LSTMs.

  Args:
    images: (num_images, height, width, depth) tensor
    num_filters_out: output layer depth
    nhidden: hidden layer depth (defaults to num_filters_out)
    scope: optional scope name

  Returns:
    A (num_images, num_filters_out) batch.
  """
  with variable_scope.variable_scope(scope, "ReduceToFinal", [images]):
    if not nhidden:
      nhidden = num_filters_out
    batch_size, height, width, depth = _shape(images)

    # First pass: move height to the front and fold width into the batch
    # axis, then reduce along the leading axis.
    height_major = array_ops.transpose(images, [1, 0, 2, 3])
    columns = array_ops.reshape(height_major,
                                [height, batch_size * width, depth])
    with variable_scope.variable_scope("reduce1"):
      column_summary = lstm1d.sequence_to_final(columns, nhidden)

    # Second pass: unfold the batch axis, move width to the front, and
    # reduce along it.
    per_image = array_ops.reshape(column_summary,
                                  [batch_size, width, nhidden])
    width_major = array_ops.transpose(per_image, [1, 0, 2])
    with variable_scope.variable_scope("reduce2"):
      output = lstm1d.sequence_to_final(width_major, num_filters_out)
    return output
def call(self, inputs, state, att_score=None):
  """Gated recurrent unit (GRU) with nunits cells and attention scaling.

  The update gate `u` is multiplied by `(1.0 - att_score)` before it blends
  the previous state with the candidate activation.

  Args:
    inputs: input tensor for this step.
    state: previous hidden state.
    att_score: optional attention score applied to the update gate. When it
      is `None` the gate is left unscaled, so the cell behaves as a plain
      GRU. Previously the `None` default raised a `TypeError` in
      `1.0 - att_score`.

  Returns:
    A pair `(new_h, new_h)`: the new hidden state, used both as the output
    and as the next state.
  """
  # The linear layers are created on the first call and cached on the
  # instance for later calls.
  if self._gate_linear is None:
    bias_ones = self._bias_initializer
    if self._bias_initializer is None:
      bias_ones = init_ops.constant_initializer(1.0, dtype=inputs.dtype)
    with vs.variable_scope("gates"):  # Reset gate and update gate.
      self._gate_linear = _Linear(
          [inputs, state],
          2 * self._num_units,
          True,
          bias_initializer=bias_ones,
          kernel_initializer=self._kernel_initializer)

  value = math_ops.sigmoid(self._gate_linear([inputs, state]))
  r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)

  r_state = r * state
  if self._candidate_linear is None:
    with vs.variable_scope("candidate"):
      self._candidate_linear = _Linear(
          [inputs, r_state],
          self._num_units,
          True,
          bias_initializer=self._bias_initializer,
          kernel_initializer=self._kernel_initializer)
  c = self._activation(self._candidate_linear([inputs, r_state]))

  # Only scale the update gate when an attention score was supplied.
  if att_score is not None:
    u = (1.0 - att_score) * u
  new_h = u * state + (1 - u) * c
  return new_h, new_h
def _set_scope_for_nonnetwork_sublayer(self, sublayer):
  """Gives `sublayer` a variable scope nested under its first parent's scope.

  Does nothing when the sublayer already has a scope.

  Args:
    sublayer: the layer whose `_scope` (and op naming) should be set.

  Raises:
    ValueError: if the sublayer's first parent is no longer available.
  """
  if sublayer._scope is not None:
    return

  # NOTE(review): `_first_parent` is called to obtain the parent and the
  # error below mentions garbage collection, so it is presumably a weak
  # reference -- confirm against where it is assigned.
  parent_ref = sublayer._first_parent
  first_parent = None if parent_ref is None else parent_ref()
  if not first_parent:
    self._finalize_name(False)
    raise ValueError(
        ("The parent of a Layer added to Network %s was garbage collected "
         "before the Layer was built. If this limitation bothers you "
         "please file a feature request.") %
        (self.name,))

  first_parent._set_scope()
  parent_scope = first_parent._scope
  with variable_scope.variable_scope(parent_scope):
    # Horrid hack to make Layer variable names which are direct
    # sub-layers of Networks conform to the Network variable naming
    # conventions.
    with variable_scope.variable_scope(
        None, use_resource=True, default_name=sublayer.name) as sub_scope:
      sublayer._scope = sub_scope
      # Also switch op naming for this Layer to match Network conventions,
      # i.e. op naming matching variable naming.
      sublayer._name_scope_name = _network_name_scope_naming
def separable_lstm(images, num_filters_out, kernel_size=None, nhidden=None,
                   scope=None):
  """Run bidirectional LSTMs first horizontally then vertically.

  Args:
    images: (num_images, height, width, depth) tensor
    num_filters_out: output layer depth
    kernel_size: A list of length 2 holding the [kernel_height, kernel_width]
      of the pooling. Can be an int if both values are the same. Set to None
      for not using blocks
    nhidden: hidden layer depth (defaults to num_filters_out)
    scope: optional scope name

  Returns:
    (num_images, height/kernel_height, width/kernel_width,
    num_filters_out) tensor
  """
  swap_height_and_width = [0, 2, 1, 3]
  with variable_scope.variable_scope(scope, "SeparableLstm", [images]):
    hidden_depth = num_filters_out if nhidden is None else nhidden
    if kernel_size is not None:
      images = get_blocks(images, kernel_size)
    horizontal = horizontal_lstm(images, hidden_depth)
    # The vertical pass is the horizontal one applied to the transposed
    # tensor, with the result transposed back.
    with variable_scope.variable_scope("vertical"):
      flipped = array_ops.transpose(horizontal, swap_height_and_width)
      flipped_output = horizontal_lstm(flipped, num_filters_out)
    return array_ops.transpose(flipped_output, swap_height_and_width)
def _serving_ops(self, features):
  """Add ops for serving to the graph.

  Each feature is split along axis 1 at the length of the values feature:
  the leading part is used for filtering from the model's default start
  state, and the remainder is used for prediction from the filtered end
  state.

  Args:
    features: dictionary of input features; must contain the
      `FilteringFeatures.VALUES` key.

  Returns:
    An `EstimatorSpec` in PREDICT mode with a single predict export output.
  """
  state_key = feature_keys.State.STATE_TUPLE
  values_key = feature_keys.FilteringFeatures.VALUES

  with variable_scope.variable_scope("model", use_resource=True):
    filtering_features = {}
    prediction_features = {}
    values_length = array_ops.shape(features[values_key])[1]
    for key, value in features.items():
      if key == state_key:
        # Ignore state input. The model's default start state is replicated
        # across the batch.
        continue
      if key != values_key:
        filtering_features[key] = value[:, :values_length]
        prediction_features[key] = value[:, values_length:]
      else:
        filtering_features[key] = value
    cold_filtering_outputs = self.model.define_loss(
        features=filtering_features, mode=estimator_lib.ModeKeys.EVAL)
    prediction_features[state_key] = cold_filtering_outputs.end_state

  with variable_scope.variable_scope("model", reuse=True):
    prediction_outputs = self.model.predict(features=prediction_features)

  export_outputs = {
      feature_keys.SavedModelLabels.PREDICT:
          _NoStatePredictOutput(prediction_outputs),
  }
  return estimator_lib.EstimatorSpec(
      mode=estimator_lib.ModeKeys.PREDICT,
      export_outputs=export_outputs,
      # Likely unused, but it is necessary to return `predictions` to satisfy
      # the Estimator's error checking.
      predictions={})
def __call__(self, *args, **kwargs):
  """Calls the wrapped function inside this template's variable scope.

  Args:
    *args: positional arguments forwarded to `self._call_func`.
    **kwargs: keyword arguments forwarded to `self._call_func`.

  Returns:
    Whatever `self._call_func` returns.
  """
  if not self._variable_scope:
    # The scope was not created at construction time, so create it here.
    # Subsequent calls should reuse variables.
    with variable_scope.variable_scope(
        self._unique_name, self._name,
        custom_getter=self._custom_getter) as created_scope:
      self._variable_scope = created_scope
      with self._eager_variable_store.as_default():
        outputs = self._call_func(args, kwargs,
                                  check_for_new_variables=False)
        self._variables_created = True
        return outputs

  # The scope already exists. On the first visit the variables have not been
  # created yet; on later visits they have, and we want to reuse them while
  # checking that no new ones appear.
  already_created = bool(self._variables_created)
  with variable_scope.variable_scope(
      self._variable_scope, reuse=variable_scope.AUTO_REUSE):
    with self._eager_variable_store.as_default():
      outputs = self._call_func(
          args, kwargs, check_for_new_variables=already_created)
      if not already_created:
        # Set only after the inner function succeeded, so that a failed
        # first call is still treated as a first call next time.
        self._variables_created = True
      return outputs
def testAllowsReuseWithoutPartitioner(self):
  """A variable created under a partitioner can be reused without one."""
  scope_name = "scope0"
  variable_name = "name0"

  with variable_scope.variable_scope(
      scope_name, partitioner=axis0_into2_partitioner):
    created = variable_scope.get_variable(variable_name, shape=(3, 1, 1))

  with variable_scope.variable_scope(scope_name, reuse=True):
    fetched = variable_scope.get_variable(variable_name)

  self.assertEqual(created, fetched)
def _serving_ops(self, features):
  """Add ops for serving to the graph.

  Builds three sub-graphs under the shared "model" variable scope:
  prediction, filtering, and cold-start filtering (filtering with all state
  features removed).

  Args:
    features: dictionary of input features keyed by string.

  Returns:
    An `EstimatorSpec` in PREDICT mode with one export output per sub-graph.
  """
  eval_mode = estimator_lib.ModeKeys.EVAL
  labels = feature_keys.SavedModelLabels

  with variable_scope.variable_scope("model", use_resource=True):
    prediction_outputs = self.model.predict(features=features)

  with variable_scope.variable_scope("model", reuse=True):
    filtering_outputs = self.create_loss(features, eval_mode)

  with variable_scope.variable_scope("model", reuse=True):
    # Ignore any state management when cold-starting. The model's default
    # start state is replicated across the batch.
    no_state_features = {}
    for key, value in features.items():
      if not key.startswith(feature_keys.State.STATE_PREFIX):
        no_state_features[key] = value
    cold_filtering_outputs = self.model.define_loss(
        features=no_state_features, mode=eval_mode)

  export_outputs = {}
  export_outputs[labels.PREDICT] = export_lib.PredictOutput(
      prediction_outputs)
  export_outputs[labels.FILTER] = export_lib.PredictOutput(
      state_to_dictionary(filtering_outputs.end_state))
  export_outputs[labels.COLD_START_FILTER] = _NoStatePredictOutput(
      state_to_dictionary(cold_filtering_outputs.end_state))

  return estimator_lib.EstimatorSpec(
      mode=estimator_lib.ModeKeys.PREDICT,
      export_outputs=export_outputs,
      # Likely unused, but it is necessary to return `predictions` to satisfy
      # the Estimator's error checking.
      predictions={})
def testVarOpScope(self):
  """Checks variable and name-scope names produced by `variable_scope`.

  Covers re-entering an explicitly named scope ("tower") and opening a scope
  by default name only ("default"), each with a nested "scope2" name scope.
  """

  def new_w_name():
    return variable_scope.get_variable("w", []).name

  def assert_nested_name_scope(expected):
    # Open a "scope2" name scope at the current position and compare the
    # full name it resolves to.
    with ops.name_scope("scope2") as resolved:
      self.assertEqual(resolved, expected)

  with self.test_session():
    with ops.name_scope("scope1"):
      with variable_scope.variable_scope("tower", "default", []):
        self.assertEqual(new_w_name(), "tower/w:0")
        assert_nested_name_scope("scope1/tower/scope2/")
      with variable_scope.variable_scope("tower", "default", []):
        # "tower/w" already exists and the scope is not reusing.
        with self.assertRaises(ValueError):
          variable_scope.get_variable("w", [])
        assert_nested_name_scope("scope1/tower_1/scope2/")

    with ops.name_scope("scope2"):
      with variable_scope.variable_scope(None, "default", []):
        self.assertEqual(new_w_name(), "default/w:0")
        assert_nested_name_scope("scope2/default/scope2/")
      with variable_scope.variable_scope(None, "default", []):
        self.assertEqual(new_w_name(), "default_1/w:0")
        assert_nested_name_scope("scope2/default_1/scope2/")
def testGetCollection(self):
  """Checks that `get_collection` on a scope is limited to that scope.

  Variables "a" (trainable) and "b" (non-trainable) are created at the root
  and under the scopes "foo_" and "foo". Each named scope must report only
  its own variables, and the root scope must report all of them.
  """
  trainable_key = ops.GraphKeys.TRAINABLE_VARIABLES
  global_key = ops.GraphKeys.GLOBAL_VARIABLES

  def make_a_and_b():
    variable_scope.get_variable("a", [])
    variable_scope.get_variable("b", [], trainable=False)

  def collected_names(scope, key):
    return [v.name for v in scope.get_collection(key)]

  with self.test_session():
    make_a_and_b()

    with variable_scope.variable_scope("foo_") as underscore_scope:
      make_a_and_b()
      self.assertEqual(
          collected_names(underscore_scope, trainable_key), ["foo_/a:0"])
      self.assertEqual(
          collected_names(underscore_scope, global_key),
          ["foo_/a:0", "foo_/b:0"])

    with variable_scope.variable_scope("foo") as plain_scope:
      make_a_and_b()
      self.assertEqual(
          collected_names(plain_scope, trainable_key), ["foo/a:0"])
      self.assertEqual(
          collected_names(plain_scope, global_key), ["foo/a:0", "foo/b:0"])

    root_scope = variable_scope.get_variable_scope()
    self.assertEqual(
        collected_names(root_scope, global_key),
        ["a:0", "b:0", "foo_/a:0", "foo_/b:0", "foo/a:0", "foo/b:0"])
    self.assertEqual(
        collected_names(root_scope, trainable_key),
        ["a:0", "foo_/a:0", "foo/a:0"])
def testBasicLSTMCell(self):
  """Smoke-tests BasicLSTMCell, stacked and alone, for float16 and float32.

  For each dtype a fresh graph is used. Under "root" a two-layer
  MultiRNNCell of BasicLSTMCell(2) is checked for dtype inference, variable
  names, and approximate output values. Under "other" a single
  BasicLSTMCell(2) is run with a 3-wide input.
  """
  for dtype in [dtypes.float16, dtypes.float32]:
    np_dtype = dtype.as_numpy_dtype
    with self.test_session(graph=ops.Graph()) as sess:
      with variable_scope.variable_scope(
          "root", initializer=init_ops.constant_initializer(0.5)):
        x = array_ops.zeros([1, 2], dtype=dtype)
        m = array_ops.zeros([1, 8], dtype=dtype)
        cell = rnn_cell_impl.MultiRNNCell(
            [
                rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)
                for _ in range(2)
            ],
            state_is_tuple=False)
        # Before the first call the cell has no dtype yet.
        self.assertEqual(cell.dtype, None)
        g, out_m = cell(x, m)
        # Layer infers the input type.
        self.assertEqual(cell.dtype, dtype.name)
        # One weights and one bias variable per stacked layer, in layer order.
        expected_variable_names = [
            "root/multi_rnn_cell/cell_0/basic_lstm_cell/%s:0" %
            rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
            "root/multi_rnn_cell/cell_0/basic_lstm_cell/%s:0" %
            rnn_cell_impl._BIAS_VARIABLE_NAME,
            "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" %
            rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
            "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" %
            rnn_cell_impl._BIAS_VARIABLE_NAME
        ]
        self.assertEqual(expected_variable_names,
                         [v.name for v in cell.trainable_variables])
        self.assertFalse(cell.non_trainable_variables)
        sess.run([variables_lib.global_variables_initializer()])
        res = sess.run([g, out_m], {
            x.name: np.array([[1., 1.]]),
            m.name: 0.1 * np.ones([1, 8])
        })
        self.assertEqual(len(res), 2)
        # At this point the graph's global variables must be exactly the
        # cell's variables.
        variables = variables_lib.global_variables()
        self.assertEqual(expected_variable_names, [v.name for v in variables])
        # The numbers in results were not calculated, this is just a
        # smoke test.
        self.assertAllClose(res[0], np.array(
            [[0.240, 0.240]], dtype=np_dtype), 1e-2)
        expected_mem = np.array(
            [[0.689, 0.689, 0.448, 0.448, 0.398, 0.398, 0.240, 0.240]],
            dtype=np_dtype)
        self.assertAllClose(res[1], expected_mem, 1e-2)
      with variable_scope.variable_scope(
          "other", initializer=init_ops.constant_initializer(0.5)):
        # Test BasicLSTMCell with input_size != num_units.
        x = array_ops.zeros([1, 3], dtype=dtype)
        m = array_ops.zeros([1, 4], dtype=dtype)
        g, out_m = rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)(x, m)
        sess.run([variables_lib.global_variables_initializer()])
        res = sess.run(
            [g, out_m], {
                x.name: np.array([[1., 1., 1.]], dtype=np_dtype),
                m.name: 0.1 * np.ones([1, 4], dtype=np_dtype)
            })
        # Only checks that both outputs were produced; values are not compared.
        self.assertEqual(len(res), 2)
def testBlockGRUToGRUCellSingleStep(self):
  """Compares one step of `GRUBlockCell` against one step of `GRUCell`.

  Both cells are built with the same seeded initializer and fed the same
  random values; their results must be close.
  """
  with self.session(use_gpu=True, graph=ops.Graph()) as sess:
    batch_size, cell_size, input_size = 4, 5, 6
    seed = 1994
    initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=seed)

    # Inputs
    x = array_ops.zeros([batch_size, input_size])
    h = array_ops.zeros([batch_size, cell_size])

    # Values for the inputs.
    x_value = np.random.rand(batch_size, input_size)
    h_value = np.random.rand(batch_size, cell_size)
    feed = {x: x_value, h: h_value}

    def single_step(scope_name, cell_class):
      # Build the cell under its own scope, initialize, and run one step.
      with vs.variable_scope(scope_name, initializer=initializer):
        output = cell_class(cell_size)(x, h)
        sess.run([variables.global_variables_initializer()])
        return sess.run([output], feed)

    # Output from the basic GRU cell implementation.
    basic_res = single_step("basic", rnn_cell.GRUCell)
    # Output from the block GRU cell implementation.
    block_res = single_step("block", gru_ops.GRUBlockCell)

    self.assertEqual(len(block_res), len(basic_res))
    for block, basic in zip(block_res, basic_res):
      self.assertAllClose(block, basic)
def testCustomGradientErrorsWithNonResourceVariables(self):
  """custom_gradient must reject non-resource variables and accept resource.

  The same dense layer is wrapped twice with `custom_gradient`, once with
  `use_resource=False` on its variable scope and once with
  `use_resource=True`. The first is expected to raise `TypeError`; the second
  is expected to succeed.
  """

  def F(x, use_resource=False):
    # The dense layer creates one kernel variable (no bias) in scope "f".
    with variable_scope.variable_scope("f", use_resource=use_resource):
      out = core_layers.dense(x, 4, use_bias=False)

    def Grad(out_grad, variables=None):  # pylint: disable=redefined-outer-name
      del out_grad
      self.assertEqual(1, len(variables))
      # Constant gradients shaped like the (3, 2) input and (2, 4) kernel.
      return (array_ops.ones((3, 2)), [array_ops.ones((2, 4))])

    return out, Grad

  @custom_gradient.custom_gradient
  def FResource(x):
    return F(x, use_resource=True)

  @custom_gradient.custom_gradient
  def FNonResource(x):
    return F(x, use_resource=False)

  x = array_ops.ones((3, 2)) + 2.

  # Wrapping scope has use_resource=True but inner scope sets to False. Fails.
  with variable_scope.variable_scope("vs1", use_resource=True):
    with self.assertRaisesWithPredicateMatch(TypeError,
                                             "must be `ResourceVariable`s"):
      FNonResource(x)

  # Wrapping scope has use_resource=False but inner scope sets to True.
  # Passes.
  with variable_scope.variable_scope("vs2", use_resource=False):
    FResource(x)
def __call__(self, *args, **kwargs):
  """Calls the wrapped function inside this template's variable scope.

  Args:
    *args: positional arguments forwarded to `self._call_func`.
    **kwargs: keyword arguments forwarded to `self._call_func`.

  Returns:
    Whatever `self._call_func` returns.
  """
  # In both paths below, the template store is installed as default after
  # the variable scope is opened in order to ensure that templates nested at
  # the same level correctly uniquify lower variable scope names.
  if not self._variable_scope:
    # The scope was not created at construction time, so create it here.
    # Subsequent calls should reuse variables.
    with variable_scope.variable_scope(
        self._unique_name, self._name,
        custom_getter=self._custom_getter) as created_scope:
      self._variable_scope = created_scope
      # Because the scope was not created at construction time, the template
      # store's variable scope name is unset; set it here.
      self._template_store.set_variable_scope_name(created_scope.name)
      with self._template_store.as_default():
        return self._call_func(args, kwargs)

  # Create a cache for the variable scope context manager the first time
  # around so that we don't have to keep recreating it.
  if not self._variable_scope_context_manager:
    self._variable_scope_context_manager = variable_scope.variable_scope(
        self._variable_scope, reuse=variable_scope.AUTO_REUSE)
  with self._variable_scope_context_manager:
    with self._template_store.as_default():
      return self._call_func(args, kwargs)
def dnn_logits_fn():
  """Builds the logits from the input layer.

  Stacks one fully connected layer per entry of `dnn_hidden_units`, applying
  dropout after a layer only when `dnn_dropout` is set and `mode` is TRAIN,
  then adds a final linear layer of width `head.logits_dimension`.

  Returns:
    The output tensor of the final "logits" layer.
  """
  apply_dropout = (
      dnn_dropout is not None and mode == model_fn.ModeKeys.TRAIN)

  net = input_layer
  for layer_id, num_hidden_units in enumerate(dnn_hidden_units):
    layer_inputs = net
    with variable_scope.variable_scope(
        "hiddenlayer_%d" % layer_id,
        values=(layer_inputs,)) as hidden_layer_scope:
      net = layers.fully_connected(
          layer_inputs,
          num_hidden_units,
          activation_fn=dnn_activation_fn,
          variables_collections=[dnn_parent_scope],
          scope=hidden_layer_scope)
      if apply_dropout:
        net = layers.dropout(net, keep_prob=(1.0 - dnn_dropout))
    _add_hidden_layer_summary(net, hidden_layer_scope.name)

  with variable_scope.variable_scope(
      "logits", values=(net,)) as logits_scope:
    dnn_logits = layers.fully_connected(
        net,
        head.logits_dimension,
        activation_fn=None,
        variables_collections=[dnn_parent_scope],
        scope=logits_scope)
  _add_hidden_layer_summary(dnn_logits, logits_scope.name)
  return dnn_logits
def call(self, inputs, state):
  """Runs one LSTM step whose gate pre-activations use two chained linears.

  The concatenation of `inputs` and the previous output is first projected
  to `self._fact_size` without bias, then to `4 * self._num_units` with
  bias. An optional third bias-free projection to `self._num_proj` is
  applied to the output. The linear layers are created on first use and
  cached on the instance.

  Args:
    inputs: input tensor for this step.
    state: pair `(c_prev, m_prev)` of previous cell state and output.

  Returns:
    A pair `(m, new_state)` where `new_state` is `LSTMStateTuple(c, m)`.
  """
  c_prev, m_prev = state
  self._batch_size = inputs.shape[0].value or array_ops.shape(inputs)[0]

  current_scope = vs.get_variable_scope()
  with vs.variable_scope(current_scope, initializer=self._initializer):
    joined = array_ops.concat([inputs, m_prev], axis=1)

    with vs.variable_scope("first_gemm"):
      if self._linear1 is None:
        # no bias for bottleneck
        self._linear1 = _Linear(joined, self._fact_size, False)
      bottleneck = self._linear1(joined)

    with vs.variable_scope("second_gemm"):
      if self._linear2 is None:
        self._linear2 = _Linear(bottleneck, 4 * self._num_units, True)
      gate_logits = self._linear2(bottleneck)

    i, j, f, o = array_ops.split(gate_logits, 4, 1)

    retained = math_ops.sigmoid(f + self._forget_bias) * c_prev
    written = math_ops.sigmoid(i) * math_ops.tanh(j)
    c = retained + written
    m = math_ops.sigmoid(o) * self._activation(c)

    if self._num_proj is not None:
      with vs.variable_scope("projection"):
        if self._linear3 is None:
          self._linear3 = _Linear(m, self._num_proj, False)
        m = self._linear3(m)

  return m, rnn_cell_impl.LSTMStateTuple(c, m)
def testIndyGRUCell(self):
  """Smoke-tests `IndyGRUCell(2)` with input widths 2 and 3."""
  # (scope name, fed input row, expected output); the second case tests
  # IndyGRUCell with input_size != num_units.
  cases = (
      ("root", [[1., 1.]], [[0.185265, 0.17704]]),
      ("other", [[1., 1., 1.]], [[0.155127, 0.157328]]),
  )
  with self.test_session() as sess:
    for scope_name, x_rows, expected in cases:
      with variable_scope.variable_scope(
          scope_name, initializer=init_ops.constant_initializer(0.5)):
        x = array_ops.zeros([1, len(x_rows[0])])
        m = array_ops.zeros([1, 2])
        g, _ = contrib_rnn_cell.IndyGRUCell(2)(x, m)
        sess.run([variables_lib.global_variables_initializer()])
        feed = {
            x.name: np.array(x_rows),
            m.name: np.array([[0.1, 0.1]]),
        }
        res = sess.run([g], feed)
        # Smoke test
        self.assertAllClose(res[0], expected)
def __call__(self, inputs, state, scope=None):
  """Run this multi-layer cell on inputs, starting from state.

  Args:
    inputs: input tensor fed to the first cell.
    state: when `self._state_is_tuple` is set, a sequence with one state per
      cell; otherwise a single tensor holding every cell's state side by side
      along axis 1.
    scope: optional variable scope; defaults to "multi_rnn_cell".

  Returns:
    A pair `(outputs, new_states)`: a tuple with the output of every layer,
    and the new states laid out like `state`.

  Raises:
    ValueError: if tuple state is expected but `state` is not a sequence.
  """
  with vs.variable_scope(scope or "multi_rnn_cell"):
    layer_input = inputs
    column_offset = 0
    layer_outputs = []
    layer_states = []
    for index, cell in enumerate(self._cells):
      with vs.variable_scope("cell_%d" % index):
        if not self._state_is_tuple:
          # Cut this cell's columns out of the packed state tensor.
          layer_state = array_ops.slice(
              state, [0, column_offset], [-1, cell.state_size])
          column_offset += cell.state_size
        else:
          if not nest.is_sequence(state):
            raise ValueError(
                "Expected state to be a tuple of length %d, but received: %s"
                % (len(self.state_size), state))
          layer_state = state[index]
        layer_input, updated_state = cell(layer_input, layer_state)
        layer_outputs.append(layer_input)
        layer_states.append(updated_state)

    if self._state_is_tuple:
      new_states = tuple(layer_states)
    else:
      new_states = array_ops.concat_v2(layer_states, 1)
  return tuple(layer_outputs), new_states
def sequence_softmax(inputs, noutput, scope=None, name=None, linear_name=None):
  """Run a softmax layer over all the time steps of an input sequence.

  A single weight matrix and bias vector are created once and applied to
  every time step.

  Args:
    inputs: (length, batch_size, depth) tensor
    noutput: output depth
    scope: optional scope name
    name: optional name for output tensor
    linear_name: name for linear (pre-softmax) output

  Returns:
    A tensor of size (length, batch_size, noutput).
  """
  _, _, ninputs = _shape(inputs)
  step_inputs = array_ops.unstack(inputs)
  step_outputs = []
  with variable_scope.variable_scope(scope, "SequenceSoftmax", [inputs]):
    # NOTE(review): `0 + ninputs` is kept as in the original; its purpose is
    # not evident from this function -- confirm before simplifying.
    initial_w = random_ops.truncated_normal([0 + ninputs, noutput],
                                            stddev=0.1)
    initial_b = constant_op.constant(0.1, shape=[noutput])
    w = variables.model_variable("weights", initializer=initial_w)
    b = variables.model_variable("biases", initializer=initial_b)
    # Iterate the unstacked steps directly instead of indexing through
    # `xrange(length)`, which does not exist on Python 3 unless imported from
    # a compatibility module.
    for step_input in step_inputs:
      with variable_scope.variable_scope(scope, "SequenceSoftmaxStep",
                                         [step_input]):
        # TODO(tmb) consider using slim.fully_connected(...,
        # activation_fn=tf.nn.softmax)
        linear = nn_ops.xw_plus_b(step_input, w, b, name=linear_name)
        step_outputs.append(nn_ops.softmax(linear))
    outputs = array_ops.stack(step_outputs, name=name)
  return outputs
def dnn(tensor_in, hidden_units, activation=nn.relu, dropout=None):
  """Creates fully connected deep neural network subgraph.

  Args:
    tensor_in: tensor or placeholder for input features.
    hidden_units: list of counts of hidden units in each layer.
    activation: activation function between layers. Can be None.
    dropout: if not None, will add a dropout layer after every hidden layer;
      the value is passed on to the dropout op as `prob=1.0 - dropout`.

  Returns:
    A tensor which would be a deep neural network.
  """
  with vs.variable_scope('dnn'):
    net = tensor_in
    for layer_index, n_units in enumerate(hidden_units):
      with vs.variable_scope('layer%d' % layer_index):
        # Weight initializer was set to None to replicate the behavior of
        # rnn_cell.linear. Using fully_connected's default initializer gets
        # slightly worse quality results on unit tests.
        net = layers.legacy_fully_connected(
            net,
            n_units,
            weight_init=None,
            weight_collections=['dnn_weights'],
            bias_collections=['dnn_biases'])
        if activation is not None:
          net = activation(net)
        if dropout is None:
          continue
        # Dropout is switched at run time by the tensor(s) stored in the
        # 'IS_TRAINING' graph collection.
        is_training = array_ops_.squeeze(ops.get_collection('IS_TRAINING'))
        layer_output = net

        def with_dropout():
          return dropout_ops.dropout(layer_output, prob=(1.0 - dropout))

        def without_dropout():
          return layer_output

        net = control_flow_ops.cond(is_training, with_dropout,
                                    without_dropout)
    return net
def dnn_autoencoder(tensor_in, hidden_units, activation=nn.relu,
                    add_noise=None, dropout=None, scope=None):
  """Creates fully connected autoencoder subgraph.

  Args:
    tensor_in: tensor or placeholder for input features.
    hidden_units: list of counts of hidden units in each layer.
    activation: activation function used to map inner latent layer onto
      reconstruction layer.
    add_noise: a function that adds noise to tensor_in,
      e.g. def add_noise(x):
             return(x + np.random.normal(0, 0.1, (len(x), len(x[0]))))
    dropout: if not None, will add a dropout layer with given probability.
    scope: the variable scope for this op.

  Returns:
    Tensors for encoder and decoder.
  """
  shared_kwargs = dict(activation=activation, dropout=dropout)
  with vs.variable_op_scope([tensor_in], scope, "autoencoder"):
    noisy_input = tensor_in if add_noise is None else add_noise(tensor_in)

    # Build the DNN encoder.
    with vs.variable_scope("encoder"):
      encoder = dnn_ops.dnn(noisy_input, hidden_units, **shared_kwargs)

    # The decoder is the same DNN with the layer sizes reversed.
    with vs.variable_scope("decoder"):
      decoder = dnn_ops.dnn(encoder, hidden_units[::-1], **shared_kwargs)

    return encoder, decoder
def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
                              initial_state_fw=None, initial_state_bw=None,
                              dtype=None, parallel_iterations=None,
                              swap_memory=False, time_major=False, scope=None):
    """Creates a dynamic version of bidirectional recurrent neural network.

    Similar to the unidirectional case above (rnn) but takes input and builds
    independent forward and backward RNNs. The input_size of forward and
    backward cell must match. The initial state for both directions is zero by
    default (but can be set optionally) and no intermediate states are ever
    returned -- the network is fully unrolled for the given (passed in)
    length(s) of the sequence(s) or completely unrolled if length(s) is not
    given.

    Args:
      cell_fw: An instance of RNNCell, to be used for forward direction.
      cell_bw: An instance of RNNCell, to be used for backward direction.
      inputs: The RNN inputs.
        If time_major == False (default), this must be a tensor of shape:
          `[batch_size, max_time, input_size]`.
        If time_major == True, this must be a tensor of shape:
          `[max_time, batch_size, input_size]`.
      sequence_length: (optional) An int32/int64 vector, size `[batch_size]`,
        containing the actual lengths for each of the sequences. If None,
        every sequence is reversed over the full time dimension for the
        backward pass.
      initial_state_fw: (optional) An initial state for the forward RNN.
        This must be a tensor of appropriate type and shape
        `[batch_size, cell_fw.state_size]`.
        If `cell_fw.state_size` is a tuple, this should be a tuple of
        tensors having shapes `[batch_size, s] for s in cell_fw.state_size`.
      initial_state_bw: (optional) Same as for `initial_state_fw`, but using
        the corresponding properties of `cell_bw`.
      dtype: (optional) The data type for the initial states and expected
        output. Required if either of the initial states is not provided or
        RNN states have a heterogeneous dtype.
      parallel_iterations: (Default: 32).  The number of iterations to run in
        parallel.  Those operations which do not have any temporal dependency
        and can be run in parallel, will be.  This parameter trades off
        time for space.  Values >> 1 use more memory but take less time,
        while smaller values use less memory but computations take longer.
      swap_memory: Transparently swap the tensors produced in forward inference
        but needed for back prop from GPU to CPU.  This allows training RNNs
        which would typically not fit on a single GPU, with very minimal (or no)
        performance penalty.
      time_major: The shape format of the `inputs` and `outputs` Tensors.
        If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`.
        If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`.
        Using `time_major = True` is a bit more efficient because it avoids
        transposes at the beginning and end of the RNN calculation.  However,
        most TensorFlow data is batch-major, so by default this function
        accepts input and emits output in batch-major form.
      scope: VariableScope for the created subgraph; defaults to
        "bidirectional_rnn"

    Returns:
      A tuple (outputs, output_states) where:
        outputs: A tuple (output_fw, output_bw) containing the forward and
          the backward rnn output `Tensor`.
          If time_major == False (default),
            output_fw will be a `Tensor` shaped:
            `[batch_size, max_time, cell_fw.output_size]`
            and output_bw will be a `Tensor` shaped:
            `[batch_size, max_time, cell_bw.output_size]`.
          If time_major == True,
            output_fw will be a `Tensor` shaped:
            `[max_time, batch_size, cell_fw.output_size]`
            and output_bw will be a `Tensor` shaped:
            `[max_time, batch_size, cell_bw.output_size]`.
          It returns a tuple instead of a single concatenated `Tensor`, unlike
          in the `bidirectional_rnn`. If the concatenated one is preferred,
          the forward and backward outputs can be concatenated as
          `tf.concat_v2(outputs, 2)`.
        output_states: A tuple (output_state_fw, output_state_bw) containing
          the forward and the backward final states of bidirectional rnn.

    Raises:
      TypeError: If `cell_fw` or `cell_bw` is not an instance of `RNNCell`.
    """
    # pylint: disable=protected-access
    if not isinstance(cell_fw, rnn_cell_impl._RNNCell):
        raise TypeError("cell_fw must be an instance of RNNCell")
    if not isinstance(cell_bw, rnn_cell_impl._RNNCell):
        raise TypeError("cell_bw must be an instance of RNNCell")
    # pylint: enable=protected-access

    with vs.variable_scope(scope or "bidirectional_rnn"):
        # Forward direction
        with vs.variable_scope("fw") as fw_scope:
            output_fw, output_state_fw = dynamic_rnn(
                cell=cell_fw, inputs=inputs, sequence_length=sequence_length,
                initial_state=initial_state_fw, dtype=dtype,
                parallel_iterations=parallel_iterations,
                swap_memory=swap_memory, time_major=time_major,
                scope=fw_scope)

        # Backward direction
        if not time_major:
            time_dim = 1
            batch_dim = 0
        else:
            time_dim = 0
            batch_dim = 1

        with vs.variable_scope("bw") as bw_scope:
            # reverse_sequence needs a concrete length vector. When the caller
            # did not supply one (the documented default), reverse every
            # sequence over the whole time dimension. The RNN itself still
            # receives the original `sequence_length` (possibly None).
            reverse_lengths = sequence_length
            if reverse_lengths is None:
                input_shape = array_ops.shape(inputs)
                reverse_lengths = array_ops.fill(
                    input_shape[batch_dim:batch_dim + 1],
                    input_shape[time_dim])

            inputs_reverse = array_ops.reverse_sequence(
                input=inputs, seq_lengths=reverse_lengths,
                seq_dim=time_dim, batch_dim=batch_dim)
            tmp, output_state_bw = dynamic_rnn(
                cell=cell_bw, inputs=inputs_reverse,
                sequence_length=sequence_length,
                initial_state=initial_state_bw, dtype=dtype,
                parallel_iterations=parallel_iterations,
                swap_memory=swap_memory, time_major=time_major,
                scope=bw_scope)

    # Undo the reversal so the backward outputs line up with the forward ones.
    output_bw = array_ops.reverse_sequence(
        input=tmp, seq_lengths=reverse_lengths,
        seq_dim=time_dim, batch_dim=batch_dim)

    outputs = (output_fw, output_bw)
    output_states = (output_state_fw, output_state_bw)

    return (outputs, output_states)
def encode(self, inputs, sequence_lengths, masks=None,
           encoder_state_input=None):
    """Encodes passages with a Bi-LSTM, conditioned on a question summary.

    The model is the following:
      * run a bidirectional LSTM over the passage and concatenate the forward
        and backward outputs at each word/time-step;
      * run a bidirectional LSTM over the question and concatenate the forward
        output at the last time-step with the backward output at the first
        time-step;
      * for each time-step in the passage, concatenate the passage vector with
        that question vector.

    :param inputs: tuple `(passages_batch, questions_batch)`. Passages are
        assumed to be (None, max_length_passage, embedding dim) and questions
        (None, max_length_question, embedding dim) word embeddings, where each
        index in the second dimension represents the word at that index.
    :param sequence_lengths: tuple `(passage_lengths, question_lengths)`,
        forwarded as `sequence_length` to the respective bidirectional RNN.
    :param masks: currently unused by this implementation.
    :param encoder_state_input: currently unused by this implementation.
    :return: a tensor expected to be
        (batch_size, max_length_passage, 4 * hidden_size), assuming all hidden
        sizes are the same.
    """
    passages, questions = inputs
    passage_sequence_lengths, question_sequence_lengths = sequence_lengths

    # TODO: add mask if we want to use the final state. step-by step is
    # probably chill
    # See: https://piazza.com/class/iw9g8b9yxp46s8?cid=2153

    # NOTE(review): tf.concat(axis, values) below is the pre-1.0 TensorFlow
    # argument order; confirm the installed version before touching it.

    # Generate bi-lstm for passage
    with vs.variable_scope("Passage-Bi-LSTM"):
        # Create forward direction cell
        with vs.variable_scope('forward'):
            p_lstm_fw_cell = tf.nn.rnn_cell.LSTMCell(
                self.size, forget_bias=1.0, state_is_tuple=True)
        # Create backward cell
        with vs.variable_scope('backward'):
            p_lstm_bw_cell = tf.nn.rnn_cell.LSTMCell(
                self.size, forget_bias=1.0, state_is_tuple=True)

        # Run the bidirectional LSTM over each passage in the batch. The final
        # states are not needed, only the per-time-step outputs.
        p_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=p_lstm_fw_cell,
            cell_bw=p_lstm_bw_cell,
            inputs=passages,
            dtype=tf.float64,
            scope="Passage-Bi-LSTM",
            sequence_length=passage_sequence_lengths)

        # p_outputs[0] is the forward output at each time step and
        # p_outputs[1] the backward output; join them per time step.
        p_concat_outputs = tf.concat(2, [p_outputs[0], p_outputs[1]])

    # Generate bi-lstm for question
    with vs.variable_scope("question-Bi-LSTM"):
        # Create forward direction cell
        with vs.variable_scope('forward'):
            q_lstm_fw_cell = tf.nn.rnn_cell.LSTMCell(
                self.size, forget_bias=1.0, state_is_tuple=True)
        # Create backward cell
        with vs.variable_scope('backward'):
            q_lstm_bw_cell = tf.nn.rnn_cell.LSTMCell(
                self.size, forget_bias=1.0, state_is_tuple=True)

        q_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=q_lstm_fw_cell,
            cell_bw=q_lstm_bw_cell,
            inputs=questions,
            dtype=tf.float64,
            scope="question-Bi-LSTM",
            sequence_length=question_sequence_lengths)

        # Only concat the forward output for the last time step and the
        # backward output for the first time step.
        final_word_question = tf.concat(
            1, [q_outputs[0][:, -1, :], q_outputs[1][:, 0, :]])

    # For each word/time-step of the passage, concatenate the question vector.
    # TODO: double check this is what we want to do
    # First add a middle (time) dimension to final_word_question ...
    final_word_question = tf.expand_dims(final_word_question, 1)

    # ... then repeat it once per word in the passage. The passage length is
    # read dynamically because it may not be known statically.
    max_passage_len = tf.shape(passages)[1]
    final_word_question = tf.tile(
        final_word_question, multiples=[1, max_passage_len, 1])

    # Every word/time-step gets the same question vector appended.
    final_concat = tf.concat(2, [p_concat_outputs, final_word_question])

    return final_concat
def decode(self, knowledge_rep, sequence_lengths):
    """Scores every passage token with a Bi-LSTM followed by a linear layer.

    Takes in a knowledge representation and outputs, for every paragraph
    token, a score over `self.output_size` classes (in the answer or not).
    The computation is `xU + b`; no softmax is applied inside this method.

    Side effect: sets `self.output_size` to 2.

    :param knowledge_rep: a Tensor of size
        (batch_size, max_length_passage, knowledge_size). Its static shape
        must have exactly three dimensions, and knowledge_size is read from it.
    :param sequence_lengths: tuple `(passage_lengths, question_lengths)`; only
        the passage lengths are used, as `sequence_length` of the Bi-LSTM.
    :return: a Tensor reshaped to
        (batch_size, max_length_passage, self.output_size).
    """
    # Basic Prediction Layer
    # override output_size for now, since this is only the softmax layer
    self.output_size = 2

    passage_sequence_lengths, _ = sequence_lengths

    # Run Knowledge rep through bi-directional LSTM
    with tf.variable_scope("Decode-Bi-LSTM"):
        _, _, knowledge_size = knowledge_rep.get_shape().as_list()

        # Create forward cell
        with vs.variable_scope('forward'):
            d_lstm_fw_cell = tf.nn.rnn_cell.LSTMCell(
                knowledge_size, forget_bias=1.0, state_is_tuple=True)
        # Create backward cell
        with vs.variable_scope('backward'):
            d_lstm_bw_cell = tf.nn.rnn_cell.LSTMCell(
                knowledge_size, forget_bias=1.0, state_is_tuple=True)

        # Create bi-directional LSTM; only the per-time-step outputs are used.
        d_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=d_lstm_fw_cell,
            cell_bw=d_lstm_bw_cell,
            inputs=knowledge_rep,
            dtype=tf.float64,
            scope="Decode-Bi-LSTM",
            sequence_length=passage_sequence_lengths)

        # NOTE(review): tf.concat(axis, values) is the pre-1.0 TensorFlow
        # argument order; confirm the installed version before touching it.
        d_outputs_concat = tf.concat(2, [d_outputs[0], d_outputs[1]])

    # compute predictions as y' = xU + b
    with tf.variable_scope("Decode-Prediction"):
        # Create weight matrix
        U = tf.get_variable(
            "U", [knowledge_size * 2, self.output_size],
            dtype=tf.float64,
            initializer=tf.contrib.layers.xavier_initializer())
        # Create bias vector
        b = tf.get_variable(
            "b", [1, self.output_size],
            dtype=tf.float64,
            initializer=tf.constant_initializer(0.0))

        max_len_passage = tf.shape(knowledge_rep)[1]

        # Since max_len_passage is dynamically computed, we cannot iterate
        # over every time step of d_outputs_concat to compute the prediction.
        # Instead, reshape to a 2D matrix so that predictions for all time
        # steps come from one matrix multiplication.
        # NOTE: Assuming tf.reshape unrolls dimensions in the exact opposite
        # order it rolls dimensions (initial tests on numpy seem to indicate
        # this)
        d_outputs_reshaped = tf.reshape(
            d_outputs_concat, [-1, knowledge_size * 2])
        outputs = tf.matmul(d_outputs_reshaped, U) + b

        # outputs is (batch_size * max_len_passage, output_size); reshape it
        # back so the scores are grouped by timestep again.
        outputs = tf.reshape(
            outputs, [-1, max_len_passage, self.output_size])

    return outputs
def _line_sep(self, args, output_size, bias, bias_initializer=None,
              kernel_initializer=None):
    """Cosine-normalized counterpart of a linear map over `[x, h]`.

    Creates the variables `W_xh` and `W_hh` in the current variable scope and
    returns `self.cosine_norm(x, W_xh, 'cn_xh') +
    self.cosine_norm(h, W_hh, 'cn_hh')`, optionally followed by a bias add.

    Args:
      args: sequence that unpacks to exactly two 2D tensors `[x, h]`, each
        with a statically known second dimension.
      output_size: int, second dimension of both weight matrices. `W_hh` has
        `int(output_size / 4)` rows, so `h` is presumably expected to have
        that many columns -- confirm against `cosine_norm`.
      bias: boolean, whether to add a bias term.
      bias_initializer: initializer for the bias; defaults to a constant 0.0
        of the dtype of the first argument.
      kernel_initializer: accepted for interface compatibility but not used
        by this implementation (see the review note in the body).

    Returns:
      A tensor holding the summed cosine-normalized projections, with the bias
      added when `bias` is true.

    Raises:
      ValueError: if `args` is None or empty, if an argument is not 2D or has
        an unknown second dimension, or if `args` does not unpack to exactly
        two elements.
    """
    if args is None or (nest.is_sequence(args) and not args):
        raise ValueError("`args` must be specified")
    if not nest.is_sequence(args):
        args = [args]

    # Validate that every argument is 2D with a known second dimension.
    shapes = [a.get_shape() for a in args]
    for shape in shapes:
        if shape.ndims != 2:
            raise ValueError("linear is expecting 2D arguments: %s" % shapes)
        if shape[1].value is None:
            # The message is built from adjacent literals so that no source
            # indentation leaks into the runtime string.
            raise ValueError(
                "linear expects shape[1] to be provided for shape %s, "
                "but saw %s" % (shape, shape[1]))

    dtype = [a.dtype for a in args][0]

    # Now the computation.
    scope = vs.get_variable_scope()
    with vs.variable_scope(scope) as outer_scope:
        x, h = args
        x_size = x.get_shape().as_list()[1]
        # NOTE(review): `weights_initializer` is not a parameter of this
        # method and `kernel_initializer` is never read. Presumably
        # `weights_initializer` is defined at module level -- confirm, or
        # switch to `kernel_initializer`.
        W_xh = tf.get_variable(
            'W_xh', [x_size, output_size], initializer=weights_initializer)
        W_hh = tf.get_variable(
            'W_hh', [int(output_size / 4), output_size],
            initializer=weights_initializer)

        cn_xh = self.cosine_norm(x, W_xh, 'cn_xh')  # one hot vector
        cn_hh = self.cosine_norm(h, W_hh, 'cn_hh')

        res = cn_xh + cn_hh
        if not bias:
            return res

        with vs.variable_scope(outer_scope) as inner_scope:
            # The bias variable is created without a partitioner.
            inner_scope.set_partitioner(None)
            if bias_initializer is None:
                bias_initializer = init_ops.constant_initializer(
                    0.0, dtype=dtype)
            biases = vs.get_variable(
                _BIAS_VARIABLE_NAME, [output_size],
                dtype=dtype,
                initializer=bias_initializer)
        return nn_ops.bias_add(res, biases)
def testAddVariable(self):
    """Checks `trackable_utils.add_variable` naming, values and conflicts.

    Covers: rejecting a shape given together with a constant initializer,
    variable names under a variable scope, initial values, duplicate-name
    detection within one object, and the checkpoint names produced by
    serializing the object graph.
    """
    tracked = NonLayerTrackable()

    with self.assertRaisesRegex(ValueError, "do not specify shape"):
        trackable_utils.add_variable(
            tracked, name="shape_specified_twice", shape=[], initializer=1)

    const_var = trackable_utils.add_variable(
        tracked, name="constant_initializer", initializer=1)
    with variable_scope.variable_scope("some_variable_scope"):
        ones_var = trackable_utils.add_variable(
            tracked,
            name="ones_initializer",
            shape=[2],
            initializer=init_ops.ones_initializer(dtype=dtypes.float32))
    zeros_var = trackable_utils.add_variable(
        tracked,
        name="bare_initializer",
        shape=[2, 2],
        dtype=dtypes.float64,
        initializer=init_ops.zeros_initializer)

    # Even in graph mode, there are no naming conflicts between objects, only
    # naming conflicts within an object. The untracked variable is created
    # first, so it is the one that gets the plain name.
    untracked_duplicate = resource_variable_ops.ResourceVariable(
        name="duplicate", initial_value=1.)
    tracked_duplicate = trackable_utils.add_variable(
        tracked, name="duplicate", shape=[])
    with self.assertRaisesRegex(ValueError, "'duplicate'.*already declared"):
        trackable_utils.add_variable(tracked, name="duplicate", shape=[])

    self.evaluate(trackable_utils.gather_initializers(tracked))

    self.assertEqual("constant_initializer:0", const_var.name)
    self.assertEqual(1, self.evaluate(const_var))
    self.assertEqual("some_variable_scope/ones_initializer:0", ones_var.name)
    self.assertAllEqual([1, 1], self.evaluate(ones_var))
    self.assertAllEqual([[0., 0.], [0., 0.]], self.evaluate(zeros_var))
    self.assertEqual("a_variable:0", tracked.a_variable.name)
    self.assertEqual("duplicate:0", untracked_duplicate.name)

    # When executing eagerly, there's no uniquification of variable names. In
    # graph mode the .name attribute may be globally influenced, but the
    # checkpoint name won't be (tested below).
    expected_duplicate_name = (
        "duplicate:0" if context.executing_eagerly() else "duplicate_1:0")
    self.assertEqual(expected_duplicate_name, tracked_duplicate.name)

    serialized = graph_view.ObjectGraphView(tracked).serialize_object_graph()
    named_variables, _, _ = serialized
    actual_checkpoint_names = [v.name for v in named_variables]
    expected_checkpoint_names = (
        "a_variable/.ATTRIBUTES/VARIABLE_VALUE",
        "bare_initializer/.ATTRIBUTES/VARIABLE_VALUE",
        "constant_initializer/.ATTRIBUTES/VARIABLE_VALUE",
        "duplicate/.ATTRIBUTES/VARIABLE_VALUE",
        "ones_initializer/.ATTRIBUTES/VARIABLE_VALUE",
    )
    six.assertCountEqual(
        self, expected_checkpoint_names, actual_checkpoint_names)
def raw_rnn(cell, loop_fn, parallel_iterations=None, swap_memory=False, scope=None):
    """Creates an `RNN` specified by RNNCell `cell` and loop function `loop_fn`.

    **NOTE: This method is still in testing, and the API may change.**

    This function is a more primitive version of `dynamic_rnn` that provides
    more direct access to the inputs each iteration.  It also provides more
    control over when to start and finish reading the sequence, and
    what to emit for the output.

    For example, it can be used to implement the dynamic decoder of a seq2seq
    model.

    Instead of working with `Tensor` objects, most operations work with
    `TensorArray` objects directly.

    The operation of `raw_rnn`, in pseudo-code, is basically the following:

    ```python
    time = tf.constant(0, dtype=tf.int32)
    (finished, next_input, initial_state, _, loop_state) = loop_fn(
        time=time, cell_output=None, cell_state=None, loop_state=None)
    emit_ta = TensorArray(dynamic_size=True, dtype=initial_state.dtype)
    state = initial_state
    while not all(finished):
      (output, cell_state) = cell(next_input, state)
      (next_finished, next_input, next_state, emit, loop_state) = loop_fn(
          time=time + 1, cell_output=output, cell_state=cell_state,
          loop_state=loop_state)
      # Emit zeros and copy forward state for minibatch entries that are finished.
      state = tf.where(finished, state, next_state)
      emit = tf.where(finished, tf.zeros_like(emit), emit)
      emit_ta = emit_ta.write(time, emit)
      # If any new minibatch entries are marked as finished, mark these.
      finished = tf.logical_or(finished, next_finished)
      time += 1
    return (emit_ta, state, loop_state)
    ```

    with the additional properties that output and state may be (possibly nested)
    tuples, as determined by `cell.output_size` and `cell.state_size`, and
    as a result the final `state` and `emit_ta` may themselves be tuples.

    A simple implementation of `dynamic_rnn` via `raw_rnn` looks like this:

    ```python
    inputs = tf.placeholder(shape=(max_time, batch_size, input_depth),
                            dtype=tf.float32)
    sequence_length = tf.placeholder(shape=(batch_size,), dtype=tf.int32)
    inputs_ta = tf.TensorArray(dtype=tf.float32, size=max_time)
    inputs_ta = inputs_ta.unstack(inputs)

    cell = tf.contrib.rnn.LSTMCell(num_units)

    def loop_fn(time, cell_output, cell_state, loop_state):
      emit_output = cell_output  # == None for time == 0
      if cell_output is None:  # time == 0
        next_cell_state = cell.zero_state(batch_size, tf.float32)
      else:
        next_cell_state = cell_state
      elements_finished = (time >= sequence_length)
      finished = tf.reduce_all(elements_finished)
      next_input = tf.cond(
          finished,
          lambda: tf.zeros([batch_size, input_depth], dtype=tf.float32),
          lambda: inputs_ta.read(time))
      next_loop_state = None
      return (elements_finished, next_input, next_cell_state,
              emit_output, next_loop_state)

    outputs_ta, final_state, _ = raw_rnn(cell, loop_fn)
    outputs = outputs_ta.stack()
    ```

    Args:
      cell: An instance of RNNCell.
      loop_fn: A callable that takes inputs
        `(time, cell_output, cell_state, loop_state)`
        and returns the tuple
        `(finished, next_input, next_cell_state, emit_output, next_loop_state)`.
        Here `time` is an int32 scalar `Tensor`, `cell_output` is a
        `Tensor` or (possibly nested) tuple of tensors as determined by
        `cell.output_size`, and `cell_state` is a `Tensor`
        or (possibly nested) tuple of tensors, as determined by the `loop_fn`
        on its first call (and should match `cell.state_size`).
        The outputs are: `finished`, a boolean `Tensor` of
        shape `[batch_size]`, `next_input`: the next input to feed to `cell`,
        `next_cell_state`: the next state to feed to `cell`,
        and `emit_output`: the output to store for this iteration.

        Note that `emit_output` should be a `Tensor` or (possibly nested)
        tuple of tensors with shapes and structure matching `cell.output_size`
        and `cell_output` above.  The parameter `cell_state` and output
        `next_cell_state` may be either a single or (possibly nested) tuple
        of tensors.  The parameter `loop_state` and
        output `next_loop_state` may be either a single or (possibly nested) tuple
        of `Tensor` and `TensorArray` objects.  This last parameter
        may be ignored by `loop_fn` and the return value may be `None`.  If it
        is not `None`, then the `loop_state` will be propagated through the RNN
        loop, for use purely by `loop_fn` to keep track of its own state.
        The `next_loop_state` parameter returned may be `None`.

        The first call to `loop_fn` will be `time = 0`, `cell_output = None`,
        `cell_state = None`, and `loop_state = None`.  For this call:
        The `next_cell_state` value should be the value with which to initialize
        the cell's state.  It may be a final state from a previous RNN or it
        may be the output of `cell.zero_state()`.  It should be a
        (possibly nested) tuple structure of tensors.
        If `cell.state_size` is an integer, this must be
        a `Tensor` of appropriate type and shape `[batch_size, cell.state_size]`.
        If `cell.state_size` is a `TensorShape`, this must be a `Tensor` of
        appropriate type and shape `[batch_size] + cell.state_size`.
        If `cell.state_size` is a (possibly nested) tuple of ints or
        `TensorShape`, this will be a tuple having the corresponding shapes.
        The `emit_output` value may be either `None` or a (possibly nested)
        tuple structure of tensors, e.g.,
        `(tf.zeros(shape_0, dtype=dtype_0), tf.zeros(shape_1, dtype=dtype_1))`.
        If this first `emit_output` return value is `None`,
        then the `emit_ta` result of `raw_rnn` will have the same structure and
        dtypes as `cell.output_size`.  Otherwise `emit_ta` will have the same
        structure, shapes (prepended with a `batch_size` dimension), and dtypes
        as `emit_output`.  The actual values returned for `emit_output` at this
        initializing call are ignored.  Note, this emit structure must be
        consistent across all time steps.

      parallel_iterations: (Default: 32).  The number of iterations to run in
        parallel.  Those operations which do not have any temporal dependency
        and can be run in parallel, will be.  This parameter trades off
        time for space.  Values >> 1 use more memory but take less time,
        while smaller values use less memory but computations take longer.
      swap_memory: Transparently swap the tensors produced in forward inference
        but needed for back prop from GPU to CPU.  This allows training RNNs
        which would typically not fit on a single GPU, with very minimal (or no)
        performance penalty.
      scope: VariableScope for the created subgraph; defaults to "rnn".

    Returns:
      A tuple `(emit_ta, final_state, final_loop_state)` where:

      `emit_ta`: The RNN output `TensorArray`.
         If `loop_fn` returns a (possibly nested) set of Tensors for
         `emit_output` during initialization, (inputs `time = 0`,
         `cell_output = None`, and `loop_state = None`), then `emit_ta` will
         have the same structure, dtypes, and shapes as `emit_output` instead.
         If `loop_fn` returns `emit_output = None` during this call,
         the structure of `cell.output_size` is used:
         If `cell.output_size` is a (possibly nested) tuple of integers
         or `TensorShape` objects, then `emit_ta` will be a tuple having the
         same structure as `cell.output_size`, containing TensorArrays whose
         elements' shapes correspond to the shape data in `cell.output_size`.

      `final_state`: The final cell state.  If `cell.state_size` is an int, this
        will be shaped `[batch_size, cell.state_size]`.  If it is a
        `TensorShape`, this will be shaped `[batch_size] + cell.state_size`.
        If it is a (possibly nested) tuple of ints or `TensorShape`, this will
        be a tuple having the corresponding shapes.

      `final_loop_state`: The final loop state as returned by `loop_fn`.

    Raises:
      TypeError: If `cell` is not an instance of RNNCell, or `loop_fn` is not
        a `callable`.
    """
    # pylint: disable=protected-access
    if not isinstance(cell, rnn_cell_impl._RNNCell):
        raise TypeError("cell must be an instance of RNNCell")
    # pylint: enable=protected-access
    if not callable(loop_fn):
        raise TypeError("loop_fn must be a callable")

    # None (or any other falsy value) selects the documented default of 32.
    parallel_iterations = parallel_iterations or 32

    # Create a new scope in which the caching device is either
    # determined by the parent scope, or is set to place the cached
    # Variable using the same placement as for the rest of the RNN.
    with vs.variable_scope(scope or "rnn") as varscope:
        if varscope.caching_device is None:
            varscope.set_caching_device(lambda op: op.device)

        # Initializing call: loop_fn is asked for the first input, the initial
        # state, the emit structure and the initial loop state.
        time = constant_op.constant(0, dtype=dtypes.int32)
        (elements_finished, next_input, initial_state, emit_structure,
         init_loop_state) = loop_fn(
             time, None, None, None)  # time, cell_output, cell_state, loop_state
        flat_input = nest.flatten(next_input)

        # Need a surrogate loop state for the while_loop if none is available.
        loop_state = (init_loop_state if init_loop_state is not None
                      else constant_op.constant(0, dtype=dtypes.int32))

        input_shape = [input_.get_shape() for input_ in flat_input]
        static_batch_size = input_shape[0][0]

        for input_shape_i in input_shape:
            # Static verification that batch sizes all match
            static_batch_size.merge_with(input_shape_i[0])

        # Prefer the statically known batch size; otherwise read it from the
        # first flattened input at run time.
        batch_size = static_batch_size.value
        if batch_size is None:
            batch_size = array_ops.shape(flat_input[0])[0]

        nest.assert_same_structure(initial_state, cell.state_size)
        state = initial_state
        # Convert every leaf of the state to a tensor while keeping the
        # original nesting.
        flat_state = nest.flatten(state)
        flat_state = [ops.convert_to_tensor(s) for s in flat_state]
        state = nest.pack_sequence_as(structure=state,
                                      flat_sequence=flat_state)

        if emit_structure is not None:
            # loop_fn supplied example emit tensors: take shapes and dtypes
            # from them.
            flat_emit_structure = nest.flatten(emit_structure)
            flat_emit_size = [emit.get_shape() for emit in flat_emit_structure]
            flat_emit_dtypes = [emit.dtype for emit in flat_emit_structure]
        else:
            # Fall back to cell.output_size; every output is given the dtype
            # of the first state tensor.
            emit_structure = cell.output_size
            flat_emit_size = nest.flatten(emit_structure)
            flat_emit_dtypes = [flat_state[0].dtype] * len(flat_emit_size)

        # One dynamically sized TensorArray per flattened emit component.
        flat_emit_ta = [
            tensor_array_ops.TensorArray(
                dtype=dtype_i, dynamic_size=True, size=0,
                name="rnn_output_%d" % i)
            for i, dtype_i in enumerate(flat_emit_dtypes)
        ]
        emit_ta = nest.pack_sequence_as(structure=emit_structure,
                                        flat_sequence=flat_emit_ta)
        # Zero tensors with a leading batch dimension; written in place of the
        # real output for entries that were already finished.
        flat_zero_emit = [
            array_ops.zeros(
                _state_size_with_prefix(size_i, prefix=[batch_size]),
                dtype_i)
            for size_i, dtype_i in zip(flat_emit_size, flat_emit_dtypes)
        ]
        zero_emit = nest.pack_sequence_as(structure=emit_structure,
                                          flat_sequence=flat_zero_emit)

        def condition(unused_time, elements_finished, *_):
            # Keep looping until every batch entry is marked finished.
            return math_ops.logical_not(math_ops.reduce_all(elements_finished))

        def body(time, elements_finished, current_input,
                 emit_ta, state, loop_state):
            """Internal while loop body for raw_rnn.

            Args:
              time: time scalar.
              elements_finished: batch-size vector.
              current_input: possibly nested tuple of input tensors.
              emit_ta: possibly nested tuple of output TensorArrays.
              state: possibly nested tuple of state tensors.
              loop_state: possibly nested tuple of loop state tensors.

            Returns:
              Tuple having the same size as Args but with updated values.
            """
            (next_output, cell_state) = cell(current_input, state)

            nest.assert_same_structure(state, cell_state)
            nest.assert_same_structure(cell.output_size, next_output)

            next_time = time + 1
            (next_finished, next_input, next_state, emit_output,
             next_loop_state) = loop_fn(
                 next_time, next_output, cell_state, loop_state)

            nest.assert_same_structure(state, next_state)
            nest.assert_same_structure(current_input, next_input)
            nest.assert_same_structure(emit_ta, emit_output)

            # If loop_fn returns None for next_loop_state, just reuse the
            # previous one.
            loop_state = loop_state if next_loop_state is None else next_loop_state

            def _copy_some_through(current, candidate):
                """Copy some tensors through via array_ops.where."""
                current_flat = nest.flatten(current)
                candidate_flat = nest.flatten(candidate)
                # Where an entry was already finished before this step, keep
                # `current`; otherwise take `candidate`.
                # pylint: disable=g-long-lambda,cell-var-from-loop
                result_flat = [
                    _on_device(
                        lambda: array_ops.where(
                            elements_finished, current_i, candidate_i),
                        device=candidate_i.op.device)
                    for (current_i, candidate_i) in zip(current_flat, candidate_flat)
                ]
                # pylint: enable=g-long-lambda,cell-var-from-loop
                return nest.pack_sequence_as(
                    structure=current, flat_sequence=result_flat)

            # Both calls use the incoming `elements_finished`, i.e. the mask
            # from before this step's `next_finished` is merged in below.
            emit_output = _copy_some_through(zero_emit, emit_output)
            next_state = _copy_some_through(state, next_state)

            emit_output_flat = nest.flatten(emit_output)
            emit_ta_flat = nest.flatten(emit_ta)

            elements_finished = math_ops.logical_or(elements_finished, next_finished)

            # The output is stored at index `time`, not `next_time`.
            emit_ta_flat = [
                ta.write(time, emit)
                for (ta, emit) in zip(emit_ta_flat, emit_output_flat)
            ]

            emit_ta = nest.pack_sequence_as(
                structure=emit_structure, flat_sequence=emit_ta_flat)

            return (next_time, elements_finished, next_input,
                    emit_ta, next_state, loop_state)

        returned = control_flow_ops.while_loop(
            condition, body, loop_vars=[
                time, elements_finished, next_input,
                emit_ta, state, loop_state
            ],
            parallel_iterations=parallel_iterations,
            swap_memory=swap_memory)

        # The last three loop variables are emit_ta, state and loop_state.
        (emit_ta, final_state, final_loop_state) = returned[-3:]

        # Hide the surrogate loop state from the caller when loop_fn never
        # provided one.
        if init_loop_state is None:
            final_loop_state = None

        return (emit_ta, final_state, final_loop_state)
def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
                dtype=None, parallel_iterations=None, swap_memory=False,
                time_major=False, scope=None):
    """Builds a dynamically unrolled recurrent network around `cell`.

    `inputs` is either a single `Tensor` or a (possibly nested) tuple of
    Tensors whose first two dimensions are batch and time (or time and batch
    when `time_major` is true).  The returned outputs mirror the nested
    structure of `cell.output_size`.

    When `sequence_length` is given it is used to copy state through and
    zero out outputs past the end of each batch element's sequence.

    Args:
      cell: An instance of RNNCell.
      inputs: The RNN inputs.  With `time_major == False` (default) each
        tensor is shaped `[batch_size, max_time, ...]`; with
        `time_major == True` each is shaped `[max_time, batch_size, ...]`.
        May be a (possibly nested) tuple of such tensors, in which case the
        first two dimensions must agree across all of them.  The per-step
        input handed to `cell` has the same structure with the time
        dimension removed.
      sequence_length: (optional) An int32/int64 vector sized `[batch_size]`.
      initial_state: (optional) Initial state for the RNN.  For an integer
        `cell.state_size` this is a `Tensor` shaped
        `[batch_size, cell.state_size]`; for a tuple `cell.state_size` it is
        a tuple of tensors shaped `[batch_size, s] for s in cell.state_size`.
      dtype: (optional) Data type of the initial state and expected output.
        Required when `initial_state` is not provided or the RNN state has a
        heterogeneous dtype.
      parallel_iterations: (Default: 32) Number of loop iterations allowed to
        run in parallel.  Larger values trade memory for speed.
      swap_memory: Transparently swap tensors produced in forward inference
        but needed for back prop from GPU to CPU.
      time_major: Layout of `inputs` and of the returned outputs.  True means
        `[max_time, batch_size, depth]`, false means
        `[batch_size, max_time, depth]`.  Time-major avoids the transposes
        performed at the start and end of this function.
      scope: VariableScope for the created subgraph; defaults to "rnn".

    Returns:
      A pair `(outputs, state)`:
        outputs: The RNN output, batch-major unless `time_major` is true,
          with the same nested structure as `cell.output_size`.
        state: The final state, structured like `cell.state_size`.

    Raises:
      TypeError: If `cell` is not an instance of RNNCell.
      ValueError: If `sequence_length` has a known rank other than 1, if the
        inputs disagree on batch size, or if neither `initial_state` nor
        `dtype` is supplied.
    """
    # pylint: disable=protected-access
    if not isinstance(cell, rnn_cell_impl._RNNCell):
        raise TypeError("cell must be an instance of RNNCell")
    # pylint: enable=protected-access

    # All internal work is time-major, so batch-major inputs are swapped from
    # (B,T,D) to (T,B,D) before anything else happens.
    flat_inputs = nest.flatten(inputs)
    if not time_major:
        flat_inputs = tuple(
            array_ops.transpose(tensor, [1, 0, 2]) for tensor in flat_inputs)

    if not parallel_iterations:
        parallel_iterations = 32

    if sequence_length is not None:
        sequence_length = math_ops.to_int32(sequence_length)
        seq_len_shape = sequence_length.get_shape()
        if seq_len_shape.ndims not in (None, 1):
            raise ValueError(
                "sequence_length must be a vector of length batch_size, "
                "but saw shape: %s" % seq_len_shape)
        # The identity op exists only to give the tensor a findable name.
        sequence_length = array_ops.identity(
            sequence_length, name="sequence_length")

    # Inside this scope the caching device is inherited from the parent scope
    # when one is set; otherwise cached variables are placed wherever the op
    # using them lives.
    with vs.variable_scope(scope or "rnn") as rnn_scope:
        if rnn_scope.caching_device is None:
            rnn_scope.set_caching_device(lambda op: op.device)

        dynamic_shapes = tuple(
            array_ops.shape(tensor) for tensor in flat_inputs)
        batch_size = dynamic_shapes[0][1]
        for shape_tensor in dynamic_shapes:
            if shape_tensor[1].get_shape() != batch_size.get_shape():
                raise ValueError("All inputs should have the same batch size")

        if initial_state is None:
            if not dtype:
                raise ValueError(
                    "If no initial_state is provided, dtype must be.")
            state = cell.zero_state(batch_size, dtype)
        else:
            state = initial_state

        def _shape_assertion(x, shape):
            """Returns an Assert op that fires unless `x` has shape `shape`."""
            actual = array_ops.shape(x)
            expected = array_ops.stack(shape)
            matches = math_ops.reduce_all(math_ops.equal(actual, expected))
            return control_flow_ops.Assert(matches, [
                "Expected shape for Tensor %s is " % x.name, expected,
                " but saw shape: ", actual
            ])

        if sequence_length is not None:
            # Runtime check that sequence_length really is [batch_size].
            shape_check = _shape_assertion(sequence_length, [batch_size])
            with ops.control_dependencies([shape_check]):
                sequence_length = array_ops.identity(
                    sequence_length, name="CheckSeqLen")

        inputs = nest.pack_sequence_as(
            structure=inputs, flat_sequence=flat_inputs)

        outputs, final_state = _dynamic_rnn_loop(
            cell,
            inputs,
            state,
            parallel_iterations=parallel_iterations,
            swap_memory=swap_memory,
            sequence_length=sequence_length,
            dtype=dtype)

        # The loop always yields [time, batch, depth]; undo the initial swap
        # for batch-major callers: (T,B,D) => (B,T,D).
        if not time_major:
            batch_major_outputs = [
                array_ops.transpose(tensor, [1, 0, 2])
                for tensor in nest.flatten(outputs)
            ]
            outputs = nest.pack_sequence_as(
                structure=outputs, flat_sequence=batch_major_outputs)

        return (outputs, final_state)
def decode(self, knowledge_rep, masks, initial_state=(None, None)):
    """Score every paragraph position as answer start and as answer end.

    Two bidirectional LSTM passes are run over `knowledge_rep`: the first
    ("answer_start") is seeded with `initial_state`, the second
    ("answer_end") is seeded with the final states of the first.  In each
    pass the forward and backward outputs are summed and projected to one
    score per position.  Positions beyond each example's length receive a
    large negative offset.

    :param knowledge_rep: representation of the paragraph and question, as
        produced by the encoder
    :param masks: per-example sequence lengths; used both as the RNN
        `sequence_length` and to build the padding mask
    :param initial_state: pair (forward, backward) of initial states for the
        "answer_start" pass
    :return: tuple (start_probs, end_probs), each reshaped to
        [-1, self.output_size]
    """

    def _bidirectional_scores(state_fw, state_bw):
        """One bidirectional LSTM pass followed by a per-step linear score."""
        lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
            tf.nn.rnn_cell.BasicLSTMCell(self.hidden_size),
            output_keep_prob=self.dropout)
        directional_outputs, final_states = tf.nn.bidirectional_dynamic_rnn(
            lstm_cell,
            lstm_cell,
            knowledge_rep,
            sequence_length=masks,
            initial_state_fw=state_fw,
            initial_state_bw=state_bw,
            dtype=tf.float32)
        # Forward and backward outputs are combined by addition.
        summed = directional_outputs[0] + directional_outputs[1]
        flattened = tf.reshape(summed, [-1, self.hidden_size])
        scores = tf.nn.rnn_cell._linear(flattened, output_size=1, bias=True)
        return tf.reshape(scores, [-1, self.output_size]), final_states

    with vs.variable_scope("decoder"):
        with vs.variable_scope("answer_start"):
            start_probs, start_final_state = _bidirectional_scores(
                initial_state[0], initial_state[1])

        with vs.variable_scope("answer_end"):
            # The end pass starts from where the start pass finished.
            end_probs, _ = _bidirectional_scores(
                start_final_state[0], start_final_state[1])

        # Padding positions get -1e30 added; valid positions get 0.
        valid_positions = tf.cast(
            tf.sequence_mask(masks, maxlen=self.output_size), tf.float32)
        padding_offset = (-1e30 * (1.0 - valid_positions))
        start_probs = tf.add(start_probs, padding_offset)
        end_probs = tf.add(end_probs, padding_offset)

    return start_probs, end_probs
def _deepfm_model_fn(features,
                     labels,
                     mode,
                     head,
                     fm_first_feature_columns=None,
                     fm_second_feature_columns=None,
                     embedding_size=None,
                     field_size=None,
                     linear_optimizer='Ftrl',
                     dnn_feature_columns=None,
                     dnn_optimizer='Adagrad',
                     dnn_hidden_units=None,
                     dnn_activation_fn=nn.relu,
                     dnn_dropout=None,
                     input_layer_partitioner=None,
                     config=None):
    """DNN and FM combined model_fn.

    The logits of up to three sub-models are summed: a DNN tower, a
    first-order (linear) FM term and a second-order FM interaction term.
    Any sub-model whose feature columns are empty or `None` is skipped.

    Args:
      features: dict of `Tensor`.
      labels: `Tensor` of shape [batch_size, 1] or [batch_size] labels of
        dtype `int32` or `int64` in the range `[0, n_classes)`.
      mode: Defines whether this is training, evaluation or prediction.
        See `ModeKeys`.
      head: A `Head` instance.
      fm_first_feature_columns: An iterable containing order-1 feature
        columns used by the fm model.
      fm_second_feature_columns: An iterable containing order-2 feature
        columns used by the fm model.
      embedding_size: input field vectors can be of different sizes, their
        embeddings are of the same size.
      field_size: The number of order-2 feature columns.
      linear_optimizer: string, `Optimizer` object, or callable that defines
        the optimizer to use for training the FM model. Defaults to the Ftrl
        optimizer.
      dnn_feature_columns: An iterable containing all the feature columns
        used by the DNN model.
      dnn_optimizer: string, `Optimizer` object, or callable that defines the
        optimizer to use for training the DNN model. Defaults to the Adagrad
        optimizer.
      dnn_hidden_units: List of hidden units per DNN layer.
      dnn_activation_fn: Activation function applied to each DNN layer.
      dnn_dropout: When not `None`, the probability we will drop out a given
        DNN coordinate.
      input_layer_partitioner: Partitioner for input layer. When falsy, a
        min-max partitioner bounded by the number of parameter servers is
        used.
      config: `RunConfig` object to configure the runtime settings.

    Returns:
      An `EstimatorSpec` instance.

    Raises:
      ValueError: If `features` is not a dict, if `fm_first_feature_columns`,
        `fm_second_feature_columns` and `dnn_feature_columns` are all empty
        at the same time, or if `dnn_feature_columns` is given without
        `dnn_hidden_units`.
    """
    if not isinstance(features, dict):
        raise ValueError('features should be a dictionary of `Tensor`s. '
                         'Given type: {}'.format(type(features)))
    if (not fm_first_feature_columns and not dnn_feature_columns
            and not fm_second_feature_columns):
        raise ValueError(
            'Either fm_first_feature_columns or dnn_feature_columns or '
            'fm_second_feature_columns must be defined.')

    num_ps_replicas = config.num_ps_replicas if config else 0
    input_layer_partitioner = input_layer_partitioner or (
        partitioned_variables.min_max_variable_partitioner(
            max_partitions=num_ps_replicas, min_slice_size=64 << 20))

    # Build DNN Logits.
    dnn_parent_scope = 'dnn'
    if not dnn_feature_columns:
        dnn_logits = None
    else:
        dnn_optimizer = optimizers.get_optimizer_instance(
            dnn_optimizer, learning_rate=_DNN_LEARNING_RATE)
        _check_no_sync_replicas_optimizer(dnn_optimizer)
        if not dnn_hidden_units:
            raise ValueError(
                'dnn_hidden_units must be defined when dnn_feature_columns is '
                'specified.')
        dnn_partitioner = (partitioned_variables.min_max_variable_partitioner(
            max_partitions=num_ps_replicas))
        with variable_scope.variable_scope(
                dnn_parent_scope,
                values=tuple(six.itervalues(features)),
                partitioner=dnn_partitioner):
            dnn_logit_fn = dnn._dnn_logit_fn_builder(  # pylint: disable=protected-access
                units=head.logits_dimension,
                hidden_units=dnn_hidden_units,
                feature_columns=dnn_feature_columns,
                activation_fn=dnn_activation_fn,
                dropout=dnn_dropout,
                input_layer_partitioner=input_layer_partitioner)
            dnn_logits = dnn_logit_fn(features=features, mode=mode)

    # Build FM Logits.
    fm_parent_scope = 'fm'

    # Both helpers read `scope`, which is bound by the `with ... as scope`
    # below; they must therefore only be called inside that block.
    def cal_fm_first_logits():
        """Order-1 FM term: a linear model over the first-order columns."""
        logit_fn = linear._linear_logit_fn_builder(  # pylint: disable=protected-access
            units=head.logits_dimension,
            feature_columns=fm_first_feature_columns)
        fm_first_logits = logit_fn(features=features)
        _add_layer_summary(fm_first_logits, scope.name)
        return fm_first_logits

    def cal_fm_second_logits():
        """Order-2 FM term: 0.5 * sum((sum e)^2 - sum(e^2)) over embeddings."""
        embeddings = tf.feature_column.input_layer(
            features=features, feature_columns=fm_second_feature_columns)
        embeddings = tf.reshape(
            embeddings, shape=[-1, field_size, embedding_size])
        sum_square = tf.square(tf.reduce_sum(embeddings, 1))
        square_sum = tf.reduce_sum(tf.square(embeddings), 1)
        fm_second_logits = 0.5 * tf.reduce_sum(
            tf.subtract(sum_square, square_sum), 1, keep_dims=True)
        _add_layer_summary(fm_second_logits, scope.name)
        return fm_second_logits

    if not fm_first_feature_columns and not fm_second_feature_columns:
        fm_first_logits = None
        fm_second_logits = None
    else:
        # Either column set may legitimately be None here (only one FM order
        # in use), so count a missing set as zero columns instead of calling
        # len() on None.
        num_fm_columns = (len(fm_first_feature_columns or ()) +
                          len(fm_second_feature_columns or ()))
        linear_optimizer = optimizers.get_optimizer_instance(
            linear_optimizer,
            learning_rate=_fm_learning_rate(num_fm_columns))
        _check_no_sync_replicas_optimizer(linear_optimizer)
        with variable_scope.variable_scope(
                fm_parent_scope,
                values=tuple(six.itervalues(features)),
                partitioner=input_layer_partitioner) as scope:
            if not fm_first_feature_columns:
                fm_first_logits = None
                fm_second_logits = cal_fm_second_logits()
            elif not fm_second_feature_columns:
                fm_second_logits = None
                fm_first_logits = cal_fm_first_logits()
            else:
                fm_first_logits = cal_fm_first_logits()
                fm_second_logits = cal_fm_second_logits()

    def add_logits(logits, to_add_logits):
        """Sum two optional logits tensors, treating None as absent."""
        if logits is None:
            return to_add_logits
        if to_add_logits is None:
            return logits
        return logits + to_add_logits

    # Combine logits and build full model.
    logits = None
    logits = add_logits(logits, dnn_logits)
    logits = add_logits(logits, fm_second_logits)
    logits = add_logits(logits, fm_first_logits)

    def _train_op_fn(loss):
        """Returns the op to optimize the loss."""
        train_ops = []
        global_step = training_util.get_global_step()
        if dnn_logits is not None:
            train_ops.append(
                dnn_optimizer.minimize(
                    loss,
                    var_list=ops.get_collection(
                        ops.GraphKeys.TRAINABLE_VARIABLES,
                        scope=dnn_parent_scope)))
        if fm_first_logits is not None or fm_second_logits is not None:
            train_ops.append(
                linear_optimizer.minimize(
                    loss,
                    var_list=ops.get_collection(
                        ops.GraphKeys.TRAINABLE_VARIABLES,
                        scope=fm_parent_scope)))

        train_op = control_flow_ops.group(*train_ops)
        # The global step is advanced once, after both optimizers have run.
        with ops.control_dependencies([train_op]):
            return distribute_lib.increment_var(global_step)

    return head.create_estimator_spec(
        features=features,
        mode=mode,
        labels=labels,
        train_op_fn=_train_op_fn,
        logits=logits)
def _create_definition_if_needed_impl(self):
    """This is not what you want, see _create_definition_if_needed.

    Traces `self._func` once inside a temporary `_FuncGraph` and stores the
    result on `self`. Does nothing when `self._definition` or `self._c_func`
    is already set. Depending on whether the temporary graph has a
    `_c_graph`, the result is kept either as a `FunctionDef` in
    `self._definition` or as a C function handle in `self._c_func`. In both
    cases `self._op_def`, `self._func_name`, `self._extra_inputs` and
    `self._sub_functions` are populated.

    Raises:
      ValueError: If the traced function returns a list or tuple that
        contains `None`.
    """
    if self._definition is not None or self._c_func is not None:
        return

    # Create the func_def object.
    temp_graph = _FuncGraph(capture_by_value=self._capture_by_value)
    with temp_graph.as_default():
        # List of placeholders for the function_def.
        inputs = []
        for (argname, argtype) in self._args:
            argholder = array_ops.placeholder(argtype, name=argname)
            inputs.append(argholder)
        # Call func and gather the output tensors.
        # Variable lookups made while tracing are routed through
        # temp_graph.getvar via the custom getter.
        with vs.variable_scope("", custom_getter=temp_graph.getvar):
            outputs = self._func(*inputs)

        # There is no way of distinguishing between a function not returning
        # anything and a function returning None in Python.
        # We need to allow the former and ideally want to forbid the latter as
        # it is most likely user error.
        # TODO(iga): Consider adding a @NoOutput decorator on top of @Defun to
        # allow users to explicitly mark the function as not returning anything.
        # For now, we allow a single None return and interpret it as a function
        # with no output.
        if outputs is None:
            outputs = []
        else:
            # If func only returned one value, make it a tuple.
            if not isinstance(outputs, (list, tuple)):
                outputs = (outputs, )
            if any([_ is None for _ in outputs]):
                raise ValueError("Function can not return None.")
        # Ensures each output is a Tensor in the function graph.
        outputs = [ops.convert_to_tensor(t) for t in outputs]
        # Outputs that belong to another graph are captured into temp_graph.
        outputs = [
            temp_graph.capture(t) if t.graph is not temp_graph else t
            for t in outputs
        ]
    # Whatever the temporary graph recorded as extra inputs/args is appended
    # to the function's own argument list.
    self._extra_inputs = temp_graph.extra_inputs
    inputs.extend(temp_graph.extra_args)
    # pylint: disable=protected-access
    self._sub_functions = temp_graph._functions
    # pylint: enable=protected-access

    # Extra kwargs are treated as attrs on the function def.
    if self._func_name:
        base_func_name = self._func_name
    else:
        # No explicit name: derive one from the Python function, suffixed
        # with the gradient function's name when there is one.
        base_func_name = _get_func_name(self._func)
        if self._grad_func:
            base_func_name += ("_%s" % self._grad_func.name)
    kwargs_attr = _parse_kwargs_as_attrs(base_func_name, **self._extra_kwargs)

    if not temp_graph._c_graph:  # pylint: disable=protected-access
        # Build the FunctionDef
        self._definition = graph_to_function_def.graph_to_function_def(
            temp_graph,
            temp_graph.get_operations(),
            inputs,
            outputs,
            out_names=self._out_names)

        for k in kwargs_attr:
            self._definition.attr[k].CopyFrom(kwargs_attr[k])

        # Hash the definition and its dependencies.
        self._hash_str = self._create_hash_str(
            self._definition.signature.input_arg,
            self._definition.signature.output_arg, self._definition.node_def)

        # Finally, we decide the function name to use.  If not specified,
        # make up something which is almost certainly unique (but deterministic).
        if not self._func_name:
            self._func_name = "_".join([base_func_name, self._hash_str])
        self._definition.signature.name = self._func_name
        if self._func.__doc__:
            self._definition.signature.description = self._func.__doc__

        self._op_def = self._definition.signature
    else:  # C API is enabled
        output_names = ([compat.as_bytes(x) for x in self._out_names]
                        if self._out_names else [])
        description = self._func.__doc__ or None
        # pylint: disable=protected-access
        c_func = c_api.TF_GraphToFunction_wrapper(
            temp_graph._c_graph,
            base_func_name,
            self._func_name is None,  # append_hash_to_fn_name
            None,  # opers
            [t._as_tf_output() for t in inputs],
            [t._as_tf_output() for t in outputs],
            output_names,
            None,  # opts
            description)
        self._c_func = c_api_util.ScopedTFFunction(c_func)
        # pylint: enable=protected-access
        self._set_c_attrs(kwargs_attr)

        # Set cached fields: _op_def and _func_name (if not already set)
        self._op_def = self.definition.signature
        if self._func_name:
            assert self._func_name == self._op_def.name
        else:
            self._func_name = compat.as_str(self._op_def.name)
def pointer_decoder(decoder_inputs,
                    initial_state,
                    attention_states,
                    ori_encoder_inputs,
                    cell,
                    feed_prev=False,
                    dtype=dtypes.float32,
                    scope=None):
    """RNN decoder with pointer net for the sequence-to-sequence model.

    At every step the cell is run on a linear combination of the step input
    and an (all-zero) attention vector, and the new cell state is used as a
    query to score every attention position:
      score = V^T * tanh(W * attention_states + U * new_state).
    The scores are returned un-normalized; no softmax is applied here.

    Args:
      decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
      initial_state: 2D Tensor [batch_size x cell.state_size].
      attention_states: 3D Tensor [batch_size x attn_length x attn_size].
      ori_encoder_inputs: list of encoder input Tensors. Only read when
        `feed_prev` is true, where they are stacked and reshaped to
        [batch_size x attn_length x input_size] to build the next input.
      cell: rnn_cell.RNNCell defining the cell function and size.
      feed_prev: if true, every step after the first ignores its
        `decoder_inputs` entry and instead uses the encoder inputs weighted
        by the softmax of the previous step's scores (with gradients
        stopped).
      dtype: The dtype to use for the RNN initial state (default: tf.float32).
      scope: VariableScope for the created subgraph; default: "point_decoder".

    Returns:
      A tuple `(outputs, states, inps)`:
        outputs: A list of the same length as decoder_inputs of 2D Tensors of
          shape [batch_size x attn_length] holding the pointer scores.
        states: The initial state followed by the cell state after each
          time-step, i.e. a list of length len(decoder_inputs) + 1.
        inps: The inputs that were generated from previous outputs; empty
          unless `feed_prev` is true.

    Raises:
      ValueError: If `decoder_inputs` is empty, or if dimension 1 or 2 of
        `attention_states` is not statically known.
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    # Both attn_length (dim 1) and attn_size (dim 2) are needed as Python
    # ints below, so the slice has to cover both dimensions.
    if not attention_states.get_shape()[1:3].is_fully_defined():
        raise ValueError(
            "Shape[1] and [2] of attention_states must be known: %s" %
            attention_states.get_shape())

    with vs.variable_scope(scope or "point_decoder"):
        batch_size = array_ops.shape(
            decoder_inputs[0])[0]  # Needed for reshaping.
        input_size = decoder_inputs[0].get_shape()[1].value
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape
        # before.
        hidden = array_ops.reshape(attention_states,
                                   [-1, attn_length, 1, attn_size])

        attention_vec_size = attn_size  # Size of query vectors for attention.
        k = vs.get_variable("AttnW", [1, 1, attn_size, attention_vec_size])
        hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
        v = vs.get_variable("AttnV", [attention_vec_size])

        states = [initial_state]

        def attention(query):
            """Point on hidden using hidden_features and query."""
            with vs.variable_scope("Attention"):
                y = core_rnn_cell_impl._linear(  # pylint: disable=protected-access
                    query, attention_vec_size, True)
                y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                # Un-normalized scores v^T * tanh(...), one per attention
                # position; callers apply the softmax.
                s = math_ops.reduce_sum(
                    v * math_ops.tanh(hidden_features + y), [2, 3])
                return s

        outputs = []
        batch_attn_size = array_ops.stack([batch_size, attn_size])
        attns = array_ops.zeros(batch_attn_size, dtype=dtype)
        attns.set_shape([None, attn_size])
        inps = []

        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                vs.get_variable_scope().reuse_variables()

            if feed_prev and i > 0:
                # Replace the provided input by the encoder inputs weighted
                # with the previous step's pointer distribution. `output` is
                # always bound here because i > 0.
                inp = tf.stack(ori_encoder_inputs)
                inp = tf.transpose(inp, perm=[1, 0, 2])
                inp = tf.reshape(inp, [-1, attn_length, input_size])
                inp = tf.reduce_sum(
                    inp * tf.reshape(
                        tf.nn.softmax(output), [-1, attn_length, 1]), 1)
                inp = tf.stop_gradient(inp)
                inps.append(inp)

            # Use the same inputs in inference, order internally.

            # Merge input and previous attentions into one vector of the
            # right size.
            x = core_rnn_cell_impl._linear(  # pylint: disable=protected-access
                [inp, attns], cell.output_size, True)
            # Run the RNN.
            _cell_output, new_state = cell(x, states[-1])
            states.append(new_state)
            # Run the attention mechanism.
            output = attention(new_state)
            outputs.append(output)

    return outputs, states, inps
def l2_normalization(inputs,
                     scaling=False,
                     scale_initializer=init_ops.ones_initializer(),
                     reuse=None,
                     variables_collections=None,
                     outputs_collections=None,
                     data_format='NHWC',
                     trainable=True,
                     scope=None):
    """Implement L2 normalization of the feature vector at every position.

    The input is L2-normalized along its channel axis (the last axis for
    'NHWC', axis 1 for 'NCHW'), optionally followed by a learned per-channel
    scale named 'gamma'.

    Args:
      inputs: a 4-D tensor with dimensions [batch_size, height, width,
        channels] ('NHWC') or [batch_size, channels, height, width] ('NCHW').
      scaling: whether or not to add a post scaling operation along the
        dimensions which have been normalized.
      scale_initializer: An initializer for the weights.
      reuse: whether or not the layer and its variables should be reused. To
        be able to reuse the layer scope must be given.
      variables_collections: optional list of collections for all the
        variables or a dictionary containing a different list of collection
        per variable.
      outputs_collections: collection to add the outputs.
      data_format: NHWC or NCHW data format.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      scope: Optional scope for `variable_scope`.

    Returns:
      A `Tensor` representing the output of the operation.

    Raises:
      ValueError: If `data_format` is neither 'NHWC' nor 'NCHW'.
    """
    # Reject unknown layouts before anything is added to the graph; without
    # this check the normalization axis below would simply never be set.
    if data_format not in ('NHWC', 'NCHW'):
        raise ValueError(
            "data_format must be 'NHWC' or 'NCHW', got %r" % (data_format,))

    with variable_scope.variable_scope(
            scope, 'L2Normalization', [inputs], reuse=reuse) as sc:
        inputs_shape = inputs.get_shape()
        inputs_rank = inputs_shape.ndims
        dtype = inputs.dtype.base_dtype
        if data_format == 'NHWC':
            # Channels are the last axis.
            norm_dim = tf.range(inputs_rank - 1, inputs_rank)
            params_shape = inputs_shape[-1:]
        else:  # 'NCHW'
            # Channels are axis 1. Slice (rather than index) so that
            # params_shape is a rank-1 shape, like in the NHWC branch.
            norm_dim = tf.range(1, 2)
            params_shape = inputs_shape[1:2]

        # Normalize along the channel dimension.
        outputs = nn.l2_normalize(inputs, norm_dim, epsilon=1e-12)

        # Additional scaling.
        if scaling:
            scale_collections = utils.get_variable_collections(
                variables_collections, 'scale')
            scale = variables.model_variable(
                'gamma',
                shape=params_shape,
                dtype=dtype,
                initializer=scale_initializer,
                collections=scale_collections,
                trainable=trainable)
            if data_format == 'NCHW':
                # Reshape [C] -> [C, 1, 1] so it broadcasts over H and W.
                scale = tf.expand_dims(scale, axis=-1)
                scale = tf.expand_dims(scale, axis=-1)
            outputs = tf.multiply(outputs, scale)

        return utils.collect_named_outputs(outputs_collections,
                                           sc.original_name_scope, outputs)
def streaming_tp_fp_arrays(num_gbboxes,
                           tp,
                           fp,
                           scores,
                           remove_zero_scores=True,
                           metrics_collections=None,
                           updates_collections=None,
                           name=None):
    """Streaming accumulation of true-positive and false-positive arrays.

    Alongside the TP/FP flags, the detection scores, the number of
    detections and the number of groundtruth objects are accumulated in
    local variables across batches.

    When `scores` or `fp` is a dict, the function is applied once per key of
    `num_gbboxes` and two dicts (values, update ops) keyed the same way are
    returned.

    Returns:
      A pair `(val, update_op)`. `val` is the tuple of local variables
      (num objects, num detections, tp, fp, scores) and `update_op` is the
      tuple of ops updating them, in the same order.
    """
    # Dictionary inputs: recurse per class and return dictionaries.
    if any(isinstance(candidate, dict) for candidate in (scores, fp)):
        values_by_class = {}
        updates_by_class = {}
        for cls in num_gbboxes.keys():
            cls_value, cls_update = streaming_tp_fp_arrays(
                num_gbboxes[cls],
                tp[cls],
                fp[cls],
                scores[cls],
                remove_zero_scores=remove_zero_scores,
                metrics_collections=metrics_collections,
                updates_collections=updates_collections,
                name='streaming_tp_fp_%s' % cls)
            values_by_class[cls] = cls_value
            updates_by_class[cls] = cls_update
        return values_by_class, updates_by_class

    # Tensor inputs.
    with variable_scope.variable_scope(name, 'streaming_tp_fp',
                                       [num_gbboxes, tp, fp, scores]):
        flag_dtype = tf.bool
        num_gbboxes = math_ops.to_int64(num_gbboxes)
        scores = math_ops.to_float(scores)
        tp = tf.cast(tp, flag_dtype)
        fp = tf.cast(fp, flag_dtype)

        # Flatten everything to 1-D.
        scores = tf.reshape(scores, [-1])
        tp = tf.reshape(tp, [-1])
        fp = tf.reshape(fp, [-1])

        # Keep only entries flagged as TP or FP, optionally also requiring
        # a score above a small threshold.
        keep = tf.logical_or(tp, fp)
        if remove_zero_scores:
            min_score = 1e-4
            keep = tf.logical_and(keep, tf.greater(scores, min_score))
            scores = tf.boolean_mask(scores, keep)
            tp = tf.boolean_mask(tp, keep)
            fp = tf.boolean_mask(fp, keep)

        # Local variables accumulating information over batches.
        v_nobjects = _create_local('v_num_gbboxes', shape=[], dtype=tf.int64)
        v_ndetections = _create_local(
            'v_num_detections', shape=[], dtype=tf.int32)
        v_scores = _create_local('v_scores', shape=[0, ])
        v_tp = _create_local('v_tp', shape=[0, ], dtype=flag_dtype)
        v_fp = _create_local('v_fp', shape=[0, ], dtype=flag_dtype)

        # Update operations: counters are incremented, arrays are grown by
        # concatenation (hence validate_shape=False).
        nobjects_op = state_ops.assign_add(v_nobjects,
                                           tf.reduce_sum(num_gbboxes))
        ndetections_op = state_ops.assign_add(
            v_ndetections, tf.size(scores, out_type=tf.int32))
        scores_op = state_ops.assign(
            v_scores,
            tf.concat([v_scores, scores], axis=0),
            validate_shape=False)
        tp_op = state_ops.assign(
            v_tp, tf.concat([v_tp, tp], axis=0), validate_shape=False)
        fp_op = state_ops.assign(
            v_fp, tf.concat([v_fp, fp], axis=0), validate_shape=False)

        # Value and update ops.
        val = (v_nobjects, v_ndetections, v_tp, v_fp, v_scores)
        all_updates = [nobjects_op, ndetections_op, scores_op, tp_op, fp_op]
        with ops.control_dependencies(all_updates):
            update_op = (nobjects_op, ndetections_op, tp_op, fp_op,
                         scores_op)

        if metrics_collections:
            ops.add_to_collections(metrics_collections, val)
        if updates_collections:
            ops.add_to_collections(updates_collections, update_op)
        return val, update_op
def resnet_v1(inputs,
              blocks,
              num_classes=None,
              is_training=True,
              global_pool=True,
              output_stride=None,
              include_root_block=True,
              reuse=None,
              scope=None):
    """Generator for v1 ResNet models.

    This function generates a family of ResNet v1 models. See the
    resnet_v1_*() methods for specific model instantiations, obtained by
    selecting different block instantiations that produce ResNets of various
    depths.

    Training for image classification on Imagenet is usually done with
    [224, 224] inputs, resulting in [7, 7] feature maps at the output of the
    last ResNet block for ResNets that have nominal stride equal to 32.
    However, for dense prediction tasks we advise that one uses inputs with
    spatial dimensions that are multiples of 32 plus 1, e.g., [321, 321]. In
    this case the feature maps at the ResNet output will have spatial shape
    [(height - 1) / output_stride + 1, (width - 1) / output_stride + 1]
    and corners exactly aligned with the input image corners, which greatly
    facilitates alignment of the features to the image. Using as input
    [225, 225] images results in [8, 8] feature maps at the output of the
    last ResNet block.

    For dense prediction tasks, the ResNet needs to run in
    fully-convolutional (FCN) mode and global_pool needs to be set to False.
    With a nominal stride of 32, a good choice in FCN mode is to use
    output_stride=16 in order to increase the density of the computed
    features at small computational and memory overhead, cf.
    http://arxiv.org/abs/1606.00915.

    Args:
      inputs: A tensor of size [batch, height_in, width_in, channels].
      blocks: A list of length equal to the number of ResNet blocks. Each
        element is a resnet_utils.Block object describing the units in the
        block.
      num_classes: Number of predicted classes for classification tasks. If
        None we return the features before the logit layer.
      is_training: whether batch_norm layers are in training mode.
      global_pool: If True, we perform global average pooling before
        computing the logits. Set to True for image classification, False
        for dense prediction.
      output_stride: If None, then the output will be computed at the nominal
        network stride. If output_stride is not None, it specifies the
        requested ratio of input to output spatial resolution.
      include_root_block: If True, include the initial convolution followed
        by max-pooling, if False excludes it.
      reuse: whether or not the network and its variables should be reused.
        To be able to reuse 'scope' must be given.
      scope: Optional variable_scope.

    Returns:
      net: A rank-4 tensor of size [batch, height_out, width_out,
        channels_out]. If global_pool is False, then height_out and width_out
        are reduced by a factor of output_stride compared to the respective
        height_in and width_in, else both height_out and width_out equal one.
        If num_classes is None, then net is the output of the last ResNet
        block, potentially after global average pooling. If num_classes is
        not None, net contains the pre-softmax activations.
      end_points: A dictionary from components of the network to the
        corresponding activation.

    Raises:
      ValueError: If the target output_stride is not valid.
    """
    # Validate before opening the variable scope so that an invalid stride
    # fails without adding anything to the graph.
    if (include_root_block and output_stride is not None
            and output_stride % 4 != 0):
        raise ValueError('The output_stride needs to be a multiple of 4.')

    with variable_scope.variable_scope(
            scope, 'resnet_v1', [inputs], reuse=reuse) as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        with arg_scope(
                [layers.conv2d, bottleneck, resnet_utils.stack_blocks_dense],
                outputs_collections=end_points_collection):
            with arg_scope([layers.batch_norm], is_training=is_training):
                net = inputs
                if include_root_block:
                    if output_stride is not None:
                        # The root block (stride-2 conv + stride-2 pool)
                        # already accounts for a factor of 4. Floor division
                        # keeps the stride an int under true division; the
                        # multiple-of-4 check above makes it exact.
                        output_stride //= 4
                    net = resnet_utils.conv2d_same(
                        net, 64, 7, stride=2, scope='conv1')
                    net = layers_lib.max_pool2d(
                        net, [3, 3], stride=2, scope='pool1')
                net = resnet_utils.stack_blocks_dense(net, blocks,
                                                      output_stride)
                if global_pool:
                    # Global average pooling.
                    net = math_ops.reduce_mean(
                        net, [1, 2], name='pool5', keep_dims=True)
                if num_classes is not None:
                    net = layers.conv2d(
                        net,
                        num_classes, [1, 1],
                        activation_fn=None,
                        normalizer_fn=None,
                        scope='logits')
                # Convert end_points_collection into a dictionary of
                # end_points.
                end_points = utils.convert_collection_to_dict(
                    end_points_collection)
                if num_classes is not None:
                    end_points['predictions'] = layers_lib.softmax(
                        net, scope='predictions')
                return net, end_points
def __init__(self, config, mode, forward_only, cell_mode=None,
             no_previous=False, max_cell_length=None):
    """Assemble the dependent bidirectional RNN graph, its losses and updates.

    Three losses are built: one on the forward-direction outputs, one on the
    backward-direction outputs, and one on a linear projection of both
    directions concatenated per time step. Unless `forward_only` is set,
    Adagrad updates with global-norm clipping are created for all three.

    Args:
      config: forwarded to the parent constructor.
      mode: forwarded to the parent constructor.
      forward_only: when true, no gradient or update ops are created.
      cell_mode: forwarded to the parent constructor.
      no_previous: forwarded to the parent constructor.
      max_cell_length: forwarded to the parent constructor.
    """
    super(DBRNNModel, self).__init__(
        config, mode,
        cell_mode=cell_mode,
        no_previous=no_previous,
        max_cell_length=max_cell_length)

    # Both directions use the very same cell object.
    self.cell_fw = self.cell
    self.cell_bw = self.cell

    def sampling_enabled():
        """True when sampled softmax is requested and actually smaller."""
        return self.num_samples and self.num_samples < self.num_output_symbols

    def sampled_softmax(weight_name, bias_name, input_dim):
        """Create projection variables in the current scope plus their loss.

        Returns the (weights, bias) pair and a callable with the
        (labels, inputs) signature used as `softmax_loss_function`.
        """
        proj_w = tf.get_variable(
            weight_name, [input_dim, self.num_output_symbols])
        proj_w_t = tf.transpose(proj_w)
        proj_b = tf.get_variable(bias_name, [self.num_output_symbols])

        def loss_fn(labels, inputs):
            column_labels = tf.reshape(labels, [-1, 1])
            # The sampled loss is always computed in float32.
            return tf.nn.sampled_softmax_loss(
                weights=tf.cast(proj_w_t, tf.float32),
                biases=tf.cast(proj_b, tf.float32),
                labels=column_labels,
                inputs=tf.cast(inputs, tf.float32),
                num_sampled=self.num_samples,
                num_classes=self.num_output_symbols)

        return (proj_w, proj_b), loss_fn

    output_projection_forward = None
    output_projection_backward = None
    softmax_loss_function_forward = None
    softmax_loss_function_backward = None

    # Sampled output projection for the forward direction.
    with vs.variable_scope('forward_output_linear'):
        if sampling_enabled():
            (output_projection_forward,
             softmax_loss_function_forward) = sampled_softmax(
                 "Forward_proj_w", "Forward_proj_b", self.cell_units)

    # Sampled output projection for the backward direction.
    with vs.variable_scope('backward_output_linear'):
        if sampling_enabled():
            (output_projection_backward,
             softmax_loss_function_backward) = sampled_softmax(
                 "Backward_proj_w", "Backward_proj_b", self.cell_units)

    with vs.variable_scope('Dependent_BRNN_Model'):
        # Sampled softmax for the combined output; only the loss callable
        # is consumed below, the (w, b) pair itself is not used.
        softmax_loss_function = None
        if sampling_enabled():
            _, softmax_loss_function = sampled_softmax(
                "proj_w", "proj_b",
                self.cell_fw.output_size + self.cell_bw.output_size)

        self.brnn_outputs, self.state = model_utils.dependent_brnn(
            self.inputs, self.cell_fw, self.cell_bw,
            num_input_symbols=self.num_input_symbols,
            num_output_symbols=self.num_output_symbols,
            embedding_size=self.embedding_size,
            output_projection_fw=output_projection_forward,
            output_projection_bw=output_projection_backward,
            not_shared=self.not_shared)

        self.losses_fw = model_utils.sequence_loss(
            self.brnn_outputs[0], self.targets, self.weights,
            softmax_loss_function=softmax_loss_function_forward)
        self.losses_bw = model_utils.sequence_loss(
            self.brnn_outputs[1], self.targets, self.weights,
            softmax_loss_function=softmax_loss_function_backward)

        # Combine both directions: one shared linear layer per time step,
        # created on the first step and reused afterwards.
        self.outputs = []
        for time_step in xrange(len(self.brnn_outputs[0])):
            reuse_flag = (time_step > 0) or None
            with vs.variable_scope(vs.get_variable_scope(),
                                   reuse=reuse_flag):
                both_directions = array_ops.concat(
                    [self.brnn_outputs[0][time_step],
                     self.brnn_outputs[1][time_step]], -1)
                self.outputs.append(
                    model_utils.linear(both_directions,
                                       self.num_output_symbols,
                                       scope='output_projection'))

        self.losses = model_utils.sequence_loss(
            self.outputs, self.targets, self.weights,
            softmax_loss_function=softmax_loss_function)

    # Parameter groups, selected by variable name.
    all_params = tf.compat.v1.trainable_variables()
    params_fw = [
        var for var in all_params
        if 'input_brnn' in var.name or 'output_brnn/FW' in var.name
    ]
    params_bw = [
        var for var in all_params
        if 'input_brnn' in var.name or 'output_brnn/BW' in var.name
    ]
    # The combined loss updates every parameter (shared); per the original
    # note this performs better than excluding the fw/bw groups.
    params = all_params

    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        opt = tf.compat.v1.train.AdagradOptimizer(self.learning_rate)

        loss_and_vars = [
            (self.losses, params),
            (self.losses_fw, params_fw),
            (self.losses_bw, params_bw),
        ]
        raw_grads = [
            tf.gradients(loss, var_list) for loss, var_list in loss_and_vars
        ]
        clipped_and_norms = [
            tf.clip_by_global_norm(grads, self.max_gradient_norm)
            for grads in raw_grads
        ]
        self.gradient_norms.extend(norm for _, norm in clipped_and_norms)

        (clipped, _), (clipped_fw, _), (clipped_bw, _) = clipped_and_norms
        self.updates.append(opt.apply_gradients(zip(clipped, params)))
        self.updates.append(opt.apply_gradients(zip(clipped_fw, params_fw)))
        # Only the last update advances the global step.
        self.updates.append(opt.apply_gradients(
            zip(clipped_bw, params_bw), global_step=self.global_step))

    self.saver = tf.compat.v1.train.Saver(
        tf.compat.v1.global_variables(),
        max_to_keep=self.max_checkpoints_to_keep)
def streaming_precision_recall_arrays(n_gbboxes, rclasses, rscores,
                                      tp_tensor, fp_tensor,
                                      remove_zero_labels=True,
                                      metrics_collections=None,
                                      updates_collections=None,
                                      name=None):
    """Accumulate TP/FP arrays over batches and derive precision/recall.

    Local variables keep the running number of ground-truth boxes, the number
    of detections, and the concatenated scores / true-positive /
    false-positive arrays seen so far.

    Args:
      n_gbboxes: per-batch ground-truth box counts (summed into the total).
      rclasses: detection classes; flattened, entries <= 0 are dropped when
        `remove_zero_labels` is set.
      rscores: detection scores, flattened alongside `rclasses`.
      tp_tensor: true-positive flags, cast to int32 and flattened.
      fp_tensor: false-positive flags, cast to int32 and flattened.
      remove_zero_labels: whether to mask out detections of class 0.
      metrics_collections: optional collections to add the value to.
      updates_collections: optional collections to add the update op to.
      name: optional variable scope name.

    Returns:
      A pair `(value, update_op)` as produced by `_precision_recall`.
    """
    scope_inputs = [n_gbboxes, rclasses, tp_tensor, fp_tensor]
    with variable_scope.variable_scope(name, 'stream_precision_recall',
                                       scope_inputs):
        n_gbboxes = math_ops.to_int64(n_gbboxes)
        rclasses = math_ops.to_int64(rclasses)
        rscores = math_ops.to_float(rscores)

        flag_dtype = tf.int32
        tp_tensor = tf.cast(tp_tensor, flag_dtype)
        fp_tensor = tf.cast(fp_tensor, flag_dtype)

        # Flatten everything, then optionally drop class-0 detections.
        rclasses, rscores, tp_tensor, fp_tensor = [
            tf.reshape(tensor, [-1])
            for tensor in (rclasses, rscores, tp_tensor, fp_tensor)
        ]
        if remove_zero_labels:
            keep = tf.greater(rclasses, 0)
            rclasses, rscores, tp_tensor, fp_tensor = [
                tf.boolean_mask(tensor, keep)
                for tensor in (rclasses, rscores, tp_tensor, fp_tensor)
            ]

        # Local variables accumulating information over batches.
        v_nobjects = _create_local('v_nobjects', shape=[], dtype=tf.int64)
        v_ndetections = _create_local('v_ndetections', shape=[],
                                      dtype=tf.int32)
        v_scores = _create_local('v_scores', shape=[0, ])
        v_tp = _create_local('v_tp', shape=[0, ], dtype=flag_dtype)
        v_fp = _create_local('v_fp', shape=[0, ], dtype=flag_dtype)

        def grow(accumulator, batch_values):
            """Assign `accumulator` its old content followed by the batch."""
            extended = tf.concat([accumulator, batch_values], axis=0)
            # The accumulator changes length on every update.
            return state_ops.assign(accumulator, extended,
                                    validate_shape=False)

        # Update operations.
        nobjects_op = state_ops.assign_add(v_nobjects,
                                           tf.reduce_sum(n_gbboxes))
        ndetections_op = state_ops.assign_add(
            v_ndetections, tf.size(rscores, out_type=tf.int32))
        scores_op = grow(v_scores, rscores)
        tp_op = grow(v_tp, tp_tensor)
        fp_op = grow(v_fp, fp_tensor)

        # The value reads the accumulators; the update op is computed from
        # the freshly assigned results.
        r = _precision_recall(v_nobjects, v_ndetections, v_scores,
                              v_tp, v_fp, 'value')
        pending_updates = [nobjects_op, ndetections_op, scores_op,
                           tp_op, fp_op]
        with ops.control_dependencies(pending_updates):
            update_op = _precision_recall(nobjects_op, ndetections_op,
                                          scores_op, tp_op, fp_op,
                                          'update_op')

        if metrics_collections:
            ops.add_to_collections(metrics_collections, r)
        if updates_collections:
            ops.add_to_collections(updates_collections, update_op)
        return r, update_op
def attention_RNN(encoder_outputs,
                  encoder_state,
                  num_decoder_symbols,
                  sequence_length,
                  num_heads=1,
                  dtype=dtypes.float32,
                  use_attention=True,
                  loop_function=None,
                  scope=None):
    """Turn encoder outputs into per-step logits, optionally with attention.

    Args:
      encoder_outputs: list of per-time-step encoder outputs. The first one
        is read as a rank-2 tensor whose second dimension is the output size.
      encoder_state: tensor passed through a linear layer to form the
        attention query of the first time step (attention branch only).
      num_decoder_symbols: width of the logits produced at each step.
      sequence_length: optional per-example lengths; only used by the
        non-attention branch, where it is handed to `_step` together with an
        all-zero logit.
      num_heads: number of attention heads, must be >= 1. Only head 0
        contributes to the outputs.
      dtype: dtype of the zero-initialised attention vectors.
      use_attention: selects the attention or the plain projection branch.
      loop_function: unused.
      scope: optional variable scope name.

    Returns:
      A pair `(attention_encoder_outputs, sequence_attention_weights)`: the
      list of logits per time step, and the list of head-0 attention weights
      per time step (empty in the non-attention branch).

    Raises:
      ValueError: if `num_heads < 1`, or if the attention length or size is
        not statically known.
    """
    if use_attention:
        print('Use the attention RNN model')
        if num_heads < 1:
            raise ValueError(
                "With less than 1 heads, use a non-attention decoder.")

        with variable_scope.variable_scope(scope or "attention_RNN"):
            output_size = encoder_outputs[0].get_shape()[1].value
            top_states = [array_ops.reshape(e, [-1, 1, output_size])
                          for e in encoder_outputs]
            attention_states = array_ops.concat(top_states, 1)
            # Both dim 1 (attention length) and dim 2 (attention size) are
            # read as static values below, so both must be defined.
            if not attention_states.get_shape()[1:3].is_fully_defined():
                raise ValueError(
                    "Shape[1] and [2] of attention_states must be known: %s"
                    % attention_states.get_shape())

            batch_size = array_ops.shape(top_states[0])[0]  # For reshaping.
            attn_length = attention_states.get_shape()[1].value
            attn_size = attention_states.get_shape()[2].value

            # To calculate W1 * h_t we use a 1-by-1 convolution, which needs
            # a 4-D input.
            hidden = array_ops.reshape(
                attention_states, [-1, attn_length, 1, attn_size])
            hidden_features = []
            v = []
            attention_vec_size = attn_size  # Size of the query vectors.
            for a in xrange(num_heads):
                k = variable_scope.get_variable(
                    "AttnW_%d" % a, [1, 1, attn_size, attention_vec_size])
                hidden_features.append(
                    nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
                v.append(variable_scope.get_variable(
                    "AttnV_%d" % a, [attention_vec_size]))

            def attention(query):
                """Put attention masks on hidden using hidden_features and query."""
                attn_weights = []
                ds = []  # Results of attention reads will be stored here.
                for i in xrange(num_heads):
                    with variable_scope.variable_scope("Attention_%d" % i):
                        y = rnn_cell_impl._linear(
                            query, attention_vec_size, True)
                        y = array_ops.reshape(
                            y, [-1, 1, 1, attention_vec_size])
                        # Attention mask is a softmax of v^T * tanh(...).
                        s = math_ops.reduce_sum(
                            v[i] * math_ops.tanh(hidden_features[i] + y),
                            [2, 3])
                        a = nn_ops.softmax(s)
                        attn_weights.append(a)
                        # Now calculate the attention-weighted vector d.
                        d = math_ops.reduce_sum(
                            array_ops.reshape(a, [-1, attn_length, 1, 1])
                            * hidden, [1, 2])
                        ds.append(array_ops.reshape(d, [-1, attn_size]))
                return attn_weights, ds

            batch_attn_size = array_ops.stack([batch_size, attn_size])
            attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
                     for _ in xrange(num_heads)]
            for a in attns:
                # Ensure the second shape of attention vectors is set.
                a.set_shape([None, attn_size])

            # Loop through the encoder outputs.
            attention_encoder_outputs = list()
            sequence_attention_weights = list()
            for i in xrange(len(encoder_outputs)):
                if i > 0:
                    variable_scope.get_variable_scope().reuse_variables()
                if i == 0:
                    # The first query is derived from the encoder state.
                    with variable_scope.variable_scope(
                            "Initial_Decoder_Attention"):
                        initial_state = rnn_cell_impl._linear(
                            encoder_state, output_size, True)
                    attn_weights, ds = attention(initial_state)
                else:
                    attn_weights, ds = attention(encoder_outputs[i])
                # NOTE: here we temporarily assume num_head = 1
                output = array_ops.concat([ds[0], encoder_outputs[i]], 1)
                with variable_scope.variable_scope("AttnRnnOutputProjection"):
                    logit = rnn_cell_impl._linear(
                        output, num_decoder_symbols, True)
                attention_encoder_outputs.append(logit)
                sequence_attention_weights.append(attn_weights[0])
    else:
        print('Use the NON attention RNN model')
        with variable_scope.variable_scope(scope or "non-attention_RNN"):
            attention_encoder_outputs = list()
            sequence_attention_weights = list()

            # Copy over logits once out of sequence_length.
            if encoder_outputs[0].get_shape().ndims != 1:
                (fixed_batch_size, output_size) = (
                    encoder_outputs[0].get_shape().with_rank(2))
            else:
                fixed_batch_size = (
                    encoder_outputs[0].get_shape().with_rank_at_least(1)[0])

            if fixed_batch_size.value:
                batch_size = fixed_batch_size.value
            else:
                batch_size = array_ops.shape(encoder_outputs[0])[0]
            if sequence_length is not None:
                sequence_length = math_ops.to_int32(sequence_length)
                # Prepare the values handed to `_step` for every time step.
                # `array_ops.stack` replaces `pack`, which no longer exists
                # since TensorFlow 1.0.
                zero_logit = array_ops.zeros(
                    array_ops.stack([batch_size, num_decoder_symbols]),
                    encoder_outputs[0].dtype)
                zero_logit.set_shape(tensor_shape.TensorShape(
                    [fixed_batch_size.value, num_decoder_symbols]))
                min_sequence_length = math_ops.reduce_min(sequence_length)
                max_sequence_length = math_ops.reduce_max(sequence_length)

            for time, input_ in enumerate(encoder_outputs):
                if time > 0:
                    variable_scope.get_variable_scope().reuse_variables()
                # pylint: disable=cell-var-from-loop
                generate_logit = lambda: rnn_cell_impl._linear(
                    encoder_outputs[time], num_decoder_symbols, True)
                # pylint: enable=cell-var-from-loop
                if sequence_length is not None:
                    logit = _step(
                        time, sequence_length, min_sequence_length,
                        max_sequence_length, zero_logit, generate_logit)
                else:
                    # Call the factory: the output list must hold logits
                    # tensors, not the lambda itself.
                    logit = generate_logit()
                attention_encoder_outputs.append(logit)

    return attention_encoder_outputs, sequence_attention_weights
def roll_attention_decoder(decoder_inputs,
                           initial_state,
                           encoder_states,
                           enc_padding_mask,
                           cell,
                           initial_state_attention=False,
                           pointer_gen=True):
    """Run one attention-decoder step per decoder input via `tf.map_fn`.

    Every step `ind` is computed independently from `decoder_inputs[ind]` and
    `initial_state[ind]`; no state is carried from one step to the next.

    Args:
      decoder_inputs: A list of 2D Tensors [batch_size x input_size].
      initial_state: indexed along its first axis by the step index and then
        unstacked before being fed to `cell`.
        NOTE(review): this implies one (c, h) pair per decoder step rather
        than a single 2D state; confirm against the caller.
      encoder_states: 3D Tensor [batch_size x attn_length x attn_size].
      enc_padding_mask: 2D Tensor [batch_size x attn_length] containing 1s
        and 0s; indicates which of the encoder locations are padding (0) or
        a real token (1).
      cell: rnn_cell.RNNCell defining the cell function and size.
      initial_state_attention: each decoder input is merged with a context
        vector through a linear layer before entering the cell. If False,
        that context vector is a zero vector. If True, it is recalculated
        from the step's initial state.
      pointer_gen: boolean. If True, calculate the generation probability
        p_gen for each decoder step.

    Returns:
      outputs: A list of the same length as decoder_inputs of 2D Tensors of
        shape [batch_size x cell.output_size]. The output vectors.
      states: A list with one LSTMStateTuple per decoder step.
      attn_dists: A list containing tensors of shape
        (batch_size, attn_length). The attention distributions for each
        decoder step.
      p_gens: List of the p_gen values for each decoder step. Empty list if
        pointer_gen=False.

    Raises:
      ValueError: if the input size cannot be inferred from a decoder input.
    """
    with variable_scope.variable_scope("attention_decoder"):
        # If this line fails, it's because the batch size isn't defined.
        batch_size = encoder_states.get_shape()[0].value
        # If this line fails, it's because the attention size isn't defined.
        attn_size = encoder_states.get_shape()[2].value

        # Insert a dim: now shape (batch_size, attn_len, 1, attn_size).
        encoder_states = tf.expand_dims(encoder_states, axis=2)

        # To calculate attention, we calculate
        #   v^T tanh(W_h h_i + W_s s_t + b_attn)
        # where h_i is an encoder state, and s_t a decoder state.
        # attention_vec_size is the length of the vectors v, b_attn,
        # (W_h h_i) and (W_s s_t); it equals the size of the encoder states.
        attention_vec_size = attn_size

        # Apply W_h to each encoder state to get the encoder features,
        # shape (batch_size, attn_length, 1, attention_vec_size).
        W_h = variable_scope.get_variable(
            "W_h", [1, 1, attn_size, attention_vec_size])
        encoder_features = nn_ops.conv2d(
            encoder_states, W_h, [1, 1, 1, 1], "SAME")

        # Get the weight vector v.
        v = variable_scope.get_variable("v", [attention_vec_size])

        def attention(decoder_state):
            """Calculate the context vector and attention distribution.

            Args:
              decoder_state: state of the decoder

            Returns:
              context_vector: weighted sum of encoder_states
              attn_dist: attention distribution
            """
            with variable_scope.variable_scope("Attention"):
                # W_s s_t + b_attn, shape (batch_size, attention_vec_size),
                # then reshaped to (batch_size, 1, 1, attention_vec_size).
                decoder_features = linear(
                    decoder_state, attention_vec_size, True)
                decoder_features = tf.expand_dims(
                    tf.expand_dims(decoder_features, 1), 1)

                def masked_attention(e):
                    """Softmax of e, masked by enc_padding_mask, re-normalized."""
                    attn_dist = nn_ops.softmax(e)  # (batch_size, attn_length)
                    attn_dist *= enc_padding_mask
                    masked_sums = tf.reduce_sum(attn_dist, axis=1)
                    return attn_dist / tf.reshape(masked_sums, [-1, 1])

                # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn).
                e = math_ops.reduce_sum(
                    v * math_ops.tanh(encoder_features + decoder_features),
                    [2, 3])
                attn_dist = masked_attention(e)

                # Context vector, shape (batch_size, attn_size).
                context_vector = math_ops.reduce_sum(
                    array_ops.reshape(attn_dist, [batch_size, -1, 1, 1])
                    * encoder_states, [1, 2])
                context_vector = array_ops.reshape(
                    context_vector, [-1, attn_size])

            return context_vector, attn_dist

        def run(ind):
            """Compute decoder step `ind` from its own input and state."""
            inp = tf.gather(decoder_inputs, ind)
            ith_state = tf.unstack(tf.gather(initial_state, ind))

            # Start from a zero context vector and only replace it when
            # asked to, so the recalculated vector is not discarded.
            context_vector = array_ops.zeros([batch_size, attn_size])
            # Ensure the second shape of attention vectors is set.
            context_vector.set_shape([None, attn_size])
            if initial_state_attention:
                # Re-calculate the context vector from the step's initial
                # state so it can be merged with this step's input.
                context_vector, _ = attention(ith_state)

            # Merge input and context into one vector x of the same size
            # as inp.
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError(
                    "Could not infer input size from input: %s" % inp.name)
            x = linear([inp] + [context_vector], input_size, True)

            # Run the decoder RNN cell. cell_output = decoder state.
            cell_output, state = cell(x, ith_state)

            # Run the attention mechanism. Variables may only be reused when
            # the call above has already created them.
            if initial_state_attention:
                with variable_scope.variable_scope(
                        variable_scope.get_variable_scope(), reuse=True):
                    context_vector, attn_dist = attention(state)
            else:
                context_vector, attn_dist = attention(state)

            step_results = [tf.stack(state), attn_dist]

            # Calculate p_gen.
            if pointer_gen:
                with tf.variable_scope('calculate_pgen'):
                    p_gen = linear(
                        [context_vector, state.c, state.h, x], 1, True)
                    p_gen = tf.sigmoid(p_gen)
                step_results.append(p_gen)

            # Concatenate the cell_output (= decoder state) and the context
            # vector, and pass them through a linear layer.
            # This is V[s_t, h*_t] + b in the paper.
            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([cell_output] + [context_vector],
                                cell.output_size, True)
            step_results.append(output)
            return tuple(step_results)

        # The dtype structure must mirror what `run` returns, which has no
        # p_gen entry when pointer_gen is off.
        n_results = 4 if pointer_gen else 3
        mapped = tf.map_fn(
            run,
            tf.range(len(decoder_inputs)),
            dtype=(tf.float32,) * n_results)

        states, attn_dists, outputs = mapped[0], mapped[1], mapped[-1]
        p_gens = tf.unstack(mapped[2]) if pointer_gen else []

        return tf.unstack(outputs), [
            tf.contrib.rnn.LSTMStateTuple(elem[0], elem[1])
            for elem in tf.unstack(states)
        ], tf.unstack(attn_dists), p_gens
def __init__(self,
             linear_size,
             num_layers,
             residual,
             batch_norm,
             max_norm,
             batch_size,
             learning_rate,
             summaries_dir,
             predict_14=False,
             dtype=tf.float32):
    """Build the linear + relu model that maps 2d poses to 3d poses.

    Args
      linear_size: integer. number of units in each layer of the model
      num_layers: integer. number of bilinear blocks in the model
      residual: boolean. Whether to add residual connections
      batch_norm: boolean. Whether to use batch normalization
      max_norm: boolean. Whether to clip weights to a norm of 1
      batch_size: integer. The size of the batches used during training
      learning_rate: float. Learning rate to start with
      summaries_dir: String. Directory where to log progress
      predict_14: boolean. Whether to predict 14 instead of 17 joints
      dtype: the data type to use to store internal variables
    """
    # H3.6M has 17 joints and MPII (hence stacked-hourglass detections) has
    # 16. 16 joints are used in 2d so that models stay compatible, e.g.
    # train on ground-truth 2d and test on SH detections.
    self.HUMAN_2D_SIZE = 16 * 2

    # 3d predictions are zero-centered around the root (hip) joint, so only
    # 16 joints are predicted; optionally only 14 to be comparable with
    # https://arxiv.org/pdf/1611.09010.pdf
    if predict_14:
        self.HUMAN_3D_SIZE = 14 * 3
    else:
        self.HUMAN_3D_SIZE = 16 * 3

    self.input_size = self.HUMAN_2D_SIZE
    self.output_size = self.HUMAN_3D_SIZE

    self.isTraining = tf.placeholder(tf.bool, name="isTrainingflag")
    self.dropout_keep_prob = tf.placeholder(tf.float32,
                                            name="dropout_keep_prob")

    # Summary writers for train and test runs.
    train_log_dir = os.path.join(summaries_dir, 'train')
    self.train_writer = tf.summary.FileWriter(train_log_dir)
    test_log_dir = os.path.join(summaries_dir, 'test')
    self.test_writer = tf.summary.FileWriter(test_log_dir)

    self.linear_size = linear_size
    self.batch_size = batch_size

    # The base rate lives in a variable and is then wrapped in an
    # exponential decay schedule (empirical constants).
    self.learning_rate = tf.Variable(float(learning_rate),
                                     trainable=False,
                                     dtype=dtype,
                                     name="learning_rate")
    self.global_step = tf.Variable(0, trainable=False, name="global_step")
    decay_steps = 100000
    decay_rate = 0.96
    self.learning_rate = tf.train.exponential_decay(
        self.learning_rate, self.global_step, decay_steps, decay_rate)

    # === Transform the inputs ===
    with vs.variable_scope("inputs"):
        # in=2d poses, out=3d poses
        poses_2d = tf.placeholder(dtype,
                                  shape=[None, self.input_size],
                                  name="enc_in")
        poses_3d = tf.placeholder(dtype,
                                  shape=[None, self.output_size],
                                  name="dec_out")
        self.encoder_inputs = poses_2d
        self.decoder_outputs = poses_3d

    # === Create the linear + relu combos ===
    with vs.variable_scope("linear_model"):
        # First layer: brings dimensionality up to linear_size.
        w_in = tf.get_variable(name="w1",
                               initializer=kaiming,
                               shape=[self.HUMAN_2D_SIZE, linear_size],
                               dtype=dtype)
        b_in = tf.get_variable(name="b1",
                               initializer=kaiming,
                               shape=[linear_size],
                               dtype=dtype)
        if max_norm:
            w_in = tf.clip_by_norm(w_in, 1)
        hidden = tf.matmul(poses_2d, w_in) + b_in

        if batch_norm:
            hidden = tf.layers.batch_normalization(
                hidden, training=self.isTraining, name="batch_normalization")
        hidden = tf.nn.relu(hidden)
        hidden = tf.nn.dropout(hidden, self.dropout_keep_prob)

        # Stack of bi-linear blocks.
        for block_idx in range(num_layers):
            hidden = self.two_linear(hidden, linear_size, residual,
                                     self.dropout_keep_prob, max_norm,
                                     batch_norm, dtype, block_idx)

        # Last linear layer has HUMAN_3D_SIZE in output.
        w_out = tf.get_variable(name="w4",
                                initializer=kaiming,
                                shape=[linear_size, self.HUMAN_3D_SIZE],
                                dtype=dtype)
        b_out = tf.get_variable(name="b4",
                                initializer=kaiming,
                                shape=[self.HUMAN_3D_SIZE],
                                dtype=dtype)
        if max_norm:
            w_out = tf.clip_by_norm(w_out, 1)
        prediction = tf.matmul(hidden, w_out) + b_out

    # Outputs and mean squared error against the target poses.
    self.outputs = prediction
    self.loss = tf.reduce_mean(tf.square(prediction - poses_3d))
    self.loss_summary = tf.summary.scalar('loss/loss', self.loss)

    # The error in mm is fed in from outside through a placeholder.
    self.err_mm = tf.placeholder(tf.float32, name="error_mm")
    self.err_mm_summary = tf.summary.scalar("loss/error_mm", self.err_mm)

    # Gradients and update operation for training the model. The update is
    # made to depend on the ops in the UPDATE_OPS collection.
    opt = tf.train.AdamOptimizer(self.learning_rate)
    pending_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(pending_update_ops):
        gradients = opt.compute_gradients(self.loss)
        self.gradients = [
            entry if entry is not None else [] for entry in gradients
        ]
        self.updates = opt.apply_gradients(gradients,
                                           global_step=self.global_step)

    # Keep track of the learning rate.
    self.learning_rate_summary = tf.summary.scalar(
        'learning_rate/learning_rate', self.learning_rate)

    # To save the model.
    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)
def __init__(self, config, mode, forward_only, feed_previous,
             cell_mode=None, no_previous=False, max_cell_length=None):
    """Build the basic RNN graph, its loss and (optionally) its update op.

    Args:
      config: forwarded to the parent constructor.
      mode: forwarded to the parent constructor.
      forward_only: when true, no gradient ops are built and, if a sampled
        output projection exists, it is applied to the outputs.
      feed_previous: forwarded to `model_utils.basic_rnn`.
      cell_mode: forwarded to the parent constructor.
      no_previous: forwarded to the parent constructor.
      max_cell_length: forwarded to the parent constructor.
    """
    super(BASIC_RNNModel, self).__init__(
        config, mode,
        cell_mode=cell_mode,
        no_previous=no_previous,
        max_cell_length=max_cell_length)

    # Sampled softmax is only set up when it samples fewer classes than
    # there are output symbols.
    output_projection = None
    softmax_loss_function = None
    if self.num_samples and self.num_samples < self.num_output_symbols:
        proj_w = tf.get_variable(
            "proj_w", [self.cell_units, self.num_output_symbols])
        proj_w_t = tf.transpose(proj_w)
        proj_b = tf.get_variable("proj_b", [self.num_output_symbols])
        output_projection = (proj_w, proj_b)

        def sampled_loss(labels, inputs):
            column_labels = tf.reshape(labels, [-1, 1])
            # The sampled loss is always computed in float32.
            return tf.nn.sampled_softmax_loss(
                weights=tf.cast(proj_w_t, tf.float32),
                biases=tf.cast(proj_b, tf.float32),
                labels=column_labels,
                inputs=tf.cast(inputs, tf.float32),
                num_sampled=self.num_samples,
                num_classes=self.num_output_symbols)

        softmax_loss_function = sampled_loss

    # One to one learning task.
    def build_net(inputs, feed_previous, initial_state=None):
        return model_utils.basic_rnn(
            inputs,
            self.cell,
            num_input_symbols=self.num_input_symbols,
            num_output_symbols=self.num_output_symbols,
            embedding_size=self.embedding_size,
            output_projection=output_projection,
            feed_previous=feed_previous,
            initial_state=initial_state,
            not_shared=self.not_shared)

    with vs.variable_scope('SRN_Model'):
        self.outputs, self.losses, self.state = model_utils.make_model(
            self.inputs,
            self.targets,
            self.weights,
            lambda x, y: build_net(x, feed_previous=feed_previous,
                                   initial_state=y),
            softmax_loss_function=softmax_loss_function,
            initial_state=self.initial_state)

    # At inference time the projection is applied to every output in place.
    if forward_only and output_projection is not None:
        for position, raw_output in enumerate(self.outputs):
            self.outputs[position] = (
                tf.matmul(raw_output, output_projection[0])
                + output_projection[1])

    # Gradients and update operation for training the model.
    params = tf.compat.v1.trainable_variables()
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        opt = tf.compat.v1.train.AdagradOptimizer(self.learning_rate)
        gradients = tf.gradients(self.losses, params)
        clipped_gradients, norm = tf.clip_by_global_norm(
            gradients, self.max_gradient_norm)
        self.gradient_norms.append(norm)
        train_step = opt.apply_gradients(
            zip(clipped_gradients, params), global_step=self.global_step)
        self.updates.append(train_step)

    self.saver = tf.compat.v1.train.Saver(
        tf.compat.v1.global_variables(),
        max_to_keep=self.max_checkpoints_to_keep)
def build_graph(self, question_hiddens, question_hiddens_mask,
                context_hiddens, context_hiddens_mask):
    """Compute context-to-question and question-to-context attention.

    A similarity matrix between every context and question position is built
    from three learned weight vectors. A masked softmax over the question
    axis gives the C2Q attention; a masked softmax over the context axis of
    the row-wise maximum gives the Q2C attention.

    Inputs:
      question_hiddens: Tensor shape (batch_size, question_len, 2h).
      question_hiddens_mask: Tensor shape (batch_size, question_len).
        1s where there's real input, 0s where there's padding
      context_hiddens: Tensor shape (batch_size, context_len, 2h)
      context_hiddens_mask: Tensor shape (batch_size, context_len).

    Outputs:
      c2q_attn_dist: Tensor shape (batch_size, context_len, question_len).
      c2q_output: Tensor shape (batch_size, context_len, 2h); the weighted
        sum of question_hiddens, after dropout.
      q2c_attn_dist: Tensor shape (batch_size, context_len, 1).
      q2c_output: Tensor shape (batch_size, 1, 2h); the weighted sum of
        context_hiddens, after dropout.
    """
    with vs.variable_scope("BiDAFAttn"):

        # --- Similarity matrix ---
        with vs.variable_scope("Similarity_matrix"):
            vec_size = self.context_hiddens_vec_size
            w_context = tf.get_variable(
                "W_sim1_cn", shape=[vec_size, 1])  # shape (2h, 1)
            w_question = tf.get_variable("W_sim2_qn", shape=[vec_size, 1])
            w_product = tf.get_variable("W_sim3_cq", shape=[vec_size, 1])

            question_len = question_hiddens.get_shape().as_list()[1]
            context_len = context_hiddens.get_shape().as_list()[1]

            # Context term (W_sim1 . context), repeated along the question
            # axis: (?, context_len, question_len).
            context_term = tf.tensordot(
                w_context, context_hiddens, axes=[[0], [2]],
                name="W_sim1_dot_cn")  # (1, ?, context_len)
            context_term = tf.reshape(
                context_term, [-1, context_len, 1])  # (?, context_len, 1)
            context_multiples = tf.constant([1, 1, question_len])
            context_term = tf.tile(context_term, context_multiples)

            # Question term (W_sim2 . question), repeated along the context
            # axis: (?, context_len, question_len).
            question_term = tf.tensordot(
                w_question, question_hiddens, axes=[[0], [2]],
                name="W_sim2_dot_qn")  # (1, ?, question_len)
            question_term = tf.reshape(
                question_term, [-1, question_len, 1])  # (?, question_len, 1)
            question_multiples = tf.constant([1, context_len, 1])
            question_term = tf.tile(
                question_term,
                question_multiples)  # (?, context_len*question_len, 1)
            question_term = tf.reshape(
                question_term, [-1, context_len, question_len])

            # Product term (W_sim3 . context o question):
            # (?, context_len, question_len).
            w_product_row = tf.transpose(w_product)
            product_multiples = tf.constant([context_len, 1])
            w_product_rows = tf.tile(w_product_row, product_multiples)
            weighted_context = tf.multiply(
                w_product_rows, context_hiddens, name="W_sim3_o_cn")
            question_transposed = tf.transpose(question_hiddens,
                                               perm=[0, 2, 1])
            product_term = tf.matmul(
                weighted_context, question_transposed, name="W_sim3_x_qn")

            # shape (?, context_len, question_len)
            sim_matrix = tf.add_n(
                [context_term, question_term, product_term],
                name="sim_matrix")

        # --- C2Q attention ---
        with vs.variable_scope("C2Q_Attention"):
            # Logits: (batch_size, context_len, question_len); the mask is
            # broadcast from (batch_size, 1, question_len).
            c2q_logits = sim_matrix
            c2q_mask = tf.expand_dims(question_hiddens_mask, 1)
            # Softmax over the question positions (dim 2).
            _, c2q_attn_dist = masked_softmax(c2q_logits, c2q_mask, 2)

            # Weighted sum of question_hiddens:
            # (batch_size, context_len, 2h).
            c2q_output = tf.matmul(c2q_attn_dist, question_hiddens)
            c2q_output = tf.nn.dropout(c2q_output, self.keep_prob)

        # --- Q2C attention ---
        with vs.variable_scope("Q2C_Attention"):
            # m_i: best similarity per context position,
            # shape (batch_size, context_len, 1).
            q2c_logits = tf.reduce_max(sim_matrix, axis=2, keep_dims=True)
            q2c_mask = tf.expand_dims(
                context_hiddens_mask, 2)  # (batch_size, context_len, 1)
            # beta: softmax over the context positions (dim 1).
            _, q2c_attn_dist = masked_softmax(q2c_logits, q2c_mask, 1)

            # Weighted sum of context_hiddens: (batch_size, 2h), then
            # expanded to (batch_size, 1, 2h).
            weighted_hiddens = tf.multiply(q2c_attn_dist, context_hiddens)
            q2c_output = tf.reduce_sum(weighted_hiddens, axis=1)
            q2c_output = tf.expand_dims(q2c_output, axis=1)
            q2c_output = tf.nn.dropout(q2c_output, self.keep_prob)

        return c2q_attn_dist, c2q_output, q2c_attn_dist, q2c_output
def testInitFromPartitionVar(self):
    """Check `init_from_checkpoint` with partitioned variables.

    A partitioned checkpoint is written with `_create_partition_checkpoints`
    and then restored in two fresh graphs:

    1. via name-based assignment maps, into one variable built with
       `min_slice_size=8 << 10` and one built with `min_slice_size=16 << 10`
       (the test asserts the latter ends up with a different number of
       partitions than the checkpointed values);
    2. via an assignment map whose value is the list of partition variables.
    """
    checkpoint_dir = self.get_temp_dir()
    with self.test_session() as session:
        v1 = _create_partition_checkpoints(session, checkpoint_dir)

    # New graph and session.
    with ops.Graph().as_default() as g:
        with self.test_session(graph=g) as session:
            with variable_scope.variable_scope("some_scope"):
                my1 = variable_scope.get_variable(
                    name="my1",
                    shape=[100, 100],
                    initializer=init_ops.zeros_initializer(),
                    partitioner=partitioned_variables.
                    min_max_variable_partitioner(
                        max_partitions=5, axis=0, min_slice_size=8 << 10))
                my1_var_list = my1._get_variable_list()

            # Create another variable with different partitions than the
            # variable in the checkpoint.
            with variable_scope.variable_scope("some_other_scope"):
                my2 = variable_scope.get_variable(
                    name="var1",
                    shape=[100, 100],
                    initializer=init_ops.zeros_initializer(),
                    partitioner=partitioned_variables.
                    min_max_variable_partitioner(
                        max_partitions=5, axis=0, min_slice_size=16 << 10))
                my2_var_list = my2._get_variable_list()

            checkpoint_utils.init_from_checkpoint(checkpoint_dir, {
                "scope/var1": "some_scope/my1",
                "scope/": "some_other_scope/"
            })

            session.run(variables.global_variables_initializer())
            my1_values = session.run(my1_var_list)
            self.assertAllEqual(my1_values, v1)
            my2_values = session.run(my2_var_list)
            # Verify we created different number of partitions.
            # (assertNotEqual: the assertNotEquals alias is deprecated and
            # removed from unittest in Python 3.12.)
            self.assertNotEqual(len(my2_values), len(v1))
            # Verify the values were correctly initialized in spite of
            # different partitions.
            full_my2_values = np.concatenate(my2_values, axis=0)
            full_v1_values = np.concatenate(v1, axis=0)
            self.assertAllEqual(full_my2_values, full_v1_values)

    # New graph and session.
    with ops.Graph().as_default() as g:
        with self.test_session(graph=g) as session:
            with variable_scope.variable_scope("some_scope"):
                my1 = variable_scope.get_variable(
                    name="my1",
                    shape=[100, 100],
                    initializer=init_ops.truncated_normal_initializer(0.5),
                    partitioner=partitioned_variables.
                    min_max_variable_partitioner(
                        max_partitions=5, axis=0, min_slice_size=8 << 10))
                my1_var_list = my1._get_variable_list()

            # Here the assignment map points at the partition list itself
            # rather than at a variable name.
            checkpoint_utils.init_from_checkpoint(checkpoint_dir, {
                "scope/var1": my1_var_list,
            })

            session.run(variables.global_variables_initializer())
            my1_values = session.run(my1_var_list)
            self.assertAllEqual(my1_values, v1)
def rnn(cell, inputs, initial_state=None, dtype=None, sequence_length=None,
        scope=None):
    """Creates a recurrent neural network specified by RNNCell "cell".

    The simplest form of RNN network generated is:
      state = cell.zero_state(...)
      outputs = []
      states = []
      for input_ in inputs:
        output, state = cell(input_, state)
        outputs.append(output)
        states.append(state)
      return (outputs, states)

    However, a few other options are available:

    An initial state can be provided.
    If sequence_length is provided, dynamic calculation is performed.

    Dynamic calculation returns, at time t:
      (t >= max(sequence_length)
          ? (zeros(output_shape), zeros(state_shape))
          : cell(input, state)

    Thus saving computational time when unrolling past the max sequence
    length.

    Args:
      cell: An instance of RNNCell.
      inputs: A length T list of inputs, each a tensor of shape
        [batch_size, cell.input_size].
      initial_state: (optional) An initial state for the RNN.  This must be
        a tensor of appropriate type and shape [batch_size x cell.state_size].
      dtype: (optional) The data type for the initial state.  Required if
        initial_state is not provided.
      sequence_length: (optional) An int64 vector (tensor) size [batch_size].
        `None` (the default) disables dynamic calculation.
      scope: VariableScope for the created subgraph; defaults to "RNN".

    Returns:
      A pair (outputs, states) where:
        outputs is a length T list of outputs (one for each input)
        states is a length T list of states (one state following each input)

    Raises:
      TypeError: If "cell" is not an instance of RNNCell, or if inputs is
        not a list (this includes None).
      ValueError: If inputs is an empty list, or if neither initial_state
        nor dtype is provided.
    """
    if not isinstance(cell, rnn_cell.RNNCell):
        raise TypeError("cell must be an instance of RNNCell")
    if not isinstance(inputs, list):
        raise TypeError("inputs must be a list")
    if not inputs:
        raise ValueError("inputs must not be empty")

    # sequence_length is a tensor when given; test it against None explicitly
    # instead of relying on its truthiness, which is not a valid Python bool
    # on TensorFlow versions that define Tensor.__bool__.
    use_sequence_length = sequence_length is not None

    outputs = []
    states = []
    with vs.variable_scope(scope or "RNN"):
        batch_size = array_ops.shape(inputs[0])[0]
        if initial_state is not None:
            state = initial_state
        else:
            if not dtype:
                raise ValueError(
                    "If no initial_state is provided, dtype must be.")
            state = cell.zero_state(batch_size, dtype)

        if use_sequence_length:
            # Prepare variables
            zero_output_state = (
                array_ops.zeros(
                    array_ops.pack([batch_size, cell.output_size]),
                    inputs[0].dtype),
                array_ops.zeros(
                    array_ops.pack([batch_size, cell.state_size]),
                    state.dtype))
            max_sequence_length = math_ops.reduce_max(sequence_length)

        for time, input_ in enumerate(inputs):
            # Every step after the first shares the cell's variables.
            if time > 0:
                vs.get_variable_scope().reuse_variables()

            # The closure is consumed within this same iteration, so the
            # late binding of input_/state is intentional and safe.
            # pylint: disable=cell-var-from-loop
            def output_state():
                return cell(input_, state)
            # pylint: enable=cell-var-from-loop

            if use_sequence_length:
                (output, state) = control_flow_ops.cond(
                    time >= max_sequence_length,
                    lambda: zero_output_state, output_state)
            else:
                (output, state) = output_state()

            outputs.append(output)
            states.append(state)

        return (outputs, states)
def two_linear(self, xin, linear_size, residual, dropout_keep_prob, max_norm,
               batch_norm, dtype, idx):
    """Build a block of two linear layers with an optional residual link.

    Each layer is: matmul + bias, optional batch normalization, ReLU, dropout.

    Args
      xin: the batch that enters the block
      linear_size: integer. The size of the linear units
      residual: boolean. Whether to add `xin` to the block output
      dropout_keep_prob: value handed to tf.nn.dropout as its second argument
        (a keep probability, going by the parameter name)
      max_norm: boolean. Whether to pass the weight matrices through
        tf.clip_by_norm(w, 1)
      batch_norm: boolean. Whether to do batch normalization
      dtype: type of the weights. Usually tf.float32
      idx: integer. Number of layer (for naming/scoping)
    Returns
      y: the batch after it leaves the block
    """
    layer_tag = str(idx)

    with vs.variable_scope("two_linear_" + layer_tag):
        input_size = int(xin.get_shape()[1])

        # Linear 1
        weights_1 = tf.get_variable(
            name="w2_" + layer_tag,
            initializer=kaiming,
            shape=[input_size, linear_size],
            dtype=dtype)
        bias_1 = tf.get_variable(
            name="b2_" + layer_tag,
            initializer=kaiming,
            shape=[linear_size],
            dtype=dtype)
        if max_norm:
            weights_1 = tf.clip_by_norm(weights_1, 1)
        y = tf.matmul(xin, weights_1) + bias_1

        if batch_norm:
            y = tf.layers.batch_normalization(
                y,
                training=self.isTraining,
                name="batch_normalization1" + layer_tag)
        y = tf.nn.relu(y)
        y = tf.nn.dropout(y, dropout_keep_prob)

        # Linear 2
        weights_2 = tf.get_variable(
            name="w3_" + layer_tag,
            initializer=kaiming,
            shape=[linear_size, linear_size],
            dtype=dtype)
        bias_2 = tf.get_variable(
            name="b3_" + layer_tag,
            initializer=kaiming,
            shape=[linear_size],
            dtype=dtype)
        if max_norm:
            weights_2 = tf.clip_by_norm(weights_2, 1)
        y = tf.matmul(y, weights_2) + bias_2

        if batch_norm:
            y = tf.layers.batch_normalization(
                y,
                training=self.isTraining,
                name="batch_normalization2" + layer_tag)
        y = tf.nn.relu(y)
        y = tf.nn.dropout(y, dropout_keep_prob)

        # Residual every 2 blocks
        if residual:
            y = xin + y

    return y
def build_graph(self, values, values_mask):
    """Self-attention: the values attend to themselves.

    Two bias-free dense projections of `values` ("W1" and "W2") are
    multiplied together to give the logits. When `self.advanced_dot_attn` is
    set, both projections use a ReLU activation and the transposed second
    projection is divided by sqrt(self.value_vec_size).

    Inputs:
      values: Tensor shape (batch_size, num_values, value_vec_size).
      values_mask: Tensor shape (batch_size, num_values).
        1s where there's real input, 0s where there's padding

    Outputs:
      self_attn_dist: Tensor shape (batch_size, num_values, num_values),
        the second element returned by masked_softmax over the last axis.
      output: Tensor shape (batch_size, num_values, value_vec_size).
        matmul of the attention distribution with the un-projected `values`,
        followed by dropout with self.keep_prob.
    """
    with vs.variable_scope("DotAttn"):
        advanced = self.advanced_dot_attn
        # None is tf.layers.dense's default activation, i.e. a purely linear
        # projection.
        projection_activation = tf.nn.relu if advanced else None

        v1 = tf.layers.dense(
            values,
            self.value_vec_size,
            activation=projection_activation,
            use_bias=False,
            name="W1")
        v2 = tf.layers.dense(
            values,
            self.value_vec_size,
            activation=projection_activation,
            use_bias=False,
            name="W2")

        v2_transposed = tf.transpose(v2, [0, 2, 1])
        if advanced:
            v2_transposed = v2_transposed / np.sqrt(self.value_vec_size)
        self_attn_logits = tf.matmul(v1, v2_transposed)

        # (batch_size, 1, num_values)
        self_attn_logits_mask = tf.expand_dims(values_mask, 1)
        # (batch_size, num_values, num_values)
        _, self_attn_dist = masked_softmax(
            self_attn_logits, self_attn_logits_mask, 2)

        # Use attention distribution to take weighted sum of values;
        # shape (batch_size, num_values, value_vec_size)
        output = tf.matmul(self_attn_dist, values)

        # Apply dropout
        output = tf.nn.dropout(output, self.keep_prob)

        return self_attn_dist, output
def call(self, inputs, state):
    """Run one step of LSTM.

    Args:
      inputs: input Tensor, 2D, batch x num_units.
      state: if `state_is_tuple` is False, this must be a state Tensor,
        `2-D, batch x state_size`.  If `state_is_tuple` is True, this must be
        a tuple of state Tensors, both `2-D`, with column sizes `c_state` and
        `m_state`.

    Returns:
      A 3-tuple `(m, new_state, f)`:

      - `m`: a `2-D, [batch x output_dim]` Tensor representing the output of
        the LSTM after reading `inputs` when previous state was `state`.
        Here output_dim is:
           num_proj if num_proj was set,
           num_units otherwise.
      - `new_state`: Tensor(s) representing the new state of LSTM after
        reading `inputs` when the previous state was `state`.  An
        `LSTMStateTuple(c, m)` if `state_is_tuple` is True, otherwise `c` and
        `m` concatenated along axis 1.
      - `f`: the forget-gate slice of the linear projection, returned exactly
        as split from `lstm_matrix` (before the forget bias, peephole term
        and sigmoid are applied).

    Raises:
      ValueError: If input size cannot be inferred from inputs via
        static shape inference.
    """
    num_proj = self._num_units if self._num_proj is None else self._num_proj
    sigmoid = math_ops.sigmoid

    # Unpack the previous cell state (c) and previous output (m).
    if self._state_is_tuple:
        (c_prev, m_prev) = state
    else:
        c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
        m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])

    dtype = inputs.dtype
    input_size = inputs.get_shape().with_rank(2)[1]
    if input_size.value is None:
        raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
    scope = vs.get_variable_scope()
    with vs.variable_scope(scope, initializer=self._initializer) as unit_scope:
        if self._num_unit_shards is not None:
            unit_scope.set_partitioner(
                partitioned_variables.fixed_size_partitioner(
                    self._num_unit_shards))
        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        lstm_matrix = _linear([inputs, m_prev], 4 * self._num_units, bias=True)
        i, j, f, o = array_ops.split(
            value=lstm_matrix, num_or_size_splits=4, axis=1)
        # Diagonal connections
        if self._use_peepholes:
            # The peephole vectors are created with the partitioner cleared
            # when unit sharding is configured.
            with vs.variable_scope(unit_scope) as projection_scope:
                if self._num_unit_shards is not None:
                    projection_scope.set_partitioner(None)
                w_f_diag = vs.get_variable(
                    "w_f_diag", shape=[self._num_units], dtype=dtype)
                w_i_diag = vs.get_variable(
                    "w_i_diag", shape=[self._num_units], dtype=dtype)
                w_o_diag = vs.get_variable(
                    "w_o_diag", shape=[self._num_units], dtype=dtype)

        # New cell state: gated previous state plus gated candidate input.
        if self._use_peepholes:
            c = (sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
                 sigmoid(i + w_i_diag * c_prev) * self._activation(j))
        else:
            c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) *
                 self._activation(j))

        if self._cell_clip is not None:
            # pylint: disable=invalid-unary-operand-type
            c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
            # pylint: enable=invalid-unary-operand-type
        # Output; the output-gate peephole reads the *new* cell state c.
        if self._use_peepholes:
            m = sigmoid(o + w_o_diag * c) * self._activation(c)
        else:
            m = sigmoid(o) * self._activation(c)

        if self._num_proj is not None:
            with vs.variable_scope("projection") as proj_scope:
                if self._num_proj_shards is not None:
                    proj_scope.set_partitioner(
                        partitioned_variables.fixed_size_partitioner(
                            self._num_proj_shards))
                m = _linear(m, self._num_proj, bias=False)

            if self._proj_clip is not None:
                # pylint: disable=invalid-unary-operand-type
                m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
                # pylint: enable=invalid-unary-operand-type

    new_state = (LSTMStateTuple(c, m) if self._state_is_tuple else
                 array_ops.concat([c, m], 1))
    # NOTE: unlike a plain (output, state) cell, the forget-gate slice f is
    # returned as a third element.
    return m, new_state, f
def build_graph(self, questions, questions_mask, contexts, contexts_mask):
    """Pool the question into one vector and use it to score the contexts.

    Steps, all under the "AnswerPointerLayerStart" variable scope:
      1. Run `questions` through three stacked bidirectional dynamic RNNs
         (cells self.rnn_cell_fw{1,2,3} / self.rnn_cell_bw{1,2,3}); the
         per-direction outputs are stored on self.fw_out{1,2,3} and
         self.bw_out{1,2,3}.
      2. Attend over the last RNN output with a learned vector
         ("v_answer_pooling") to obtain the pooled question vector rQ.
      3. Combine rQ with a projection of `contexts` and reduce to one logit
         per context position, then apply masked_softmax with
         `contexts_mask`.

    Shape comments below are carried over from the original annotations.

    Returns:
      (rQ, masked_logits1, prob_dist): the pooled question vector and the two
      values returned by masked_softmax over the context positions.
    """

    def relu_projection(inputs, name):
        # Bias-free dense layer of width value_vec_size followed by ReLU.
        return tf.layers.dense(
            inputs,
            self.value_vec_size,
            activation=tf.nn.relu,
            use_bias=False,
            name=name)

    with vs.variable_scope("AnswerPointerLayerStart"):
        ###### start answer pooling ######
        pooling_vector = tf.get_variable(
            "v_answer_pooling",
            shape=[1, self.value_vec_size],
            initializer=tf.contrib.layers.xavier_initializer())

        question_lens = tf.reduce_sum(questions_mask, reduction_indices=1)

        with vs.variable_scope("RNNLayer1"):
            (self.fw_out1, self.bw_out1), _ = tf.nn.bidirectional_dynamic_rnn(
                self.rnn_cell_fw1,
                self.rnn_cell_bw1,
                questions,
                question_lens,
                dtype=tf.float32)
            # (batch_size, question_len, hidden_size * 4)
            layer1_out = tf.concat([self.fw_out1, self.bw_out1], 2)

        with vs.variable_scope("RNNLayer2"):
            (self.fw_out2, self.bw_out2), _ = tf.nn.bidirectional_dynamic_rnn(
                self.rnn_cell_fw2,
                self.rnn_cell_bw2,
                layer1_out,
                question_lens,
                dtype=tf.float32)
            # (batch_size, question_len, hidden_size * 8)
            layer2_out = tf.concat([self.fw_out2, self.bw_out2], 2)

        with vs.variable_scope("RNNLayer3"):
            (self.fw_out3, self.bw_out3), _ = tf.nn.bidirectional_dynamic_rnn(
                self.rnn_cell_fw3,
                self.rnn_cell_bw3,
                layer2_out,
                question_lens,
                dtype=tf.float32)
            # (batch_size, question_len, hidden_size * 16)
            layer3_out = tf.concat([self.fw_out3, self.bw_out3], 2)

        # (1, value_vec_size)
        pooling_key = relu_projection(pooling_vector, "Wvrq")
        # (batch_size, question_len, value_vec_size)
        question_proj = relu_projection(layer3_out, "Wv")

        # (1, 1, value_vec_size); broadcasts against question_proj below
        pooling_key_3d = tf.expand_dims(pooling_key, 0)
        # (batch_size, question_len, value_vec_size)
        question_scores_hidden = tf.nn.tanh(pooling_key_3d + question_proj)
        # (batch_size, question_len, 1)
        question_scores = tf.layers.dense(
            question_scores_hidden, 1, use_bias=False)
        # (batch_size, 1, question_len)
        attn_logits = tf.transpose(question_scores, [0, 2, 1])
        # shape (batch_size, 1, question_len)
        attn_logits_mask = tf.expand_dims(questions_mask, 1)
        # shape (batch_size, 1, question_len)
        _, attn_dist = masked_softmax(attn_logits, attn_logits_mask, 2)

        # (batch_size, 1, value_vec_size)
        rQ = tf.matmul(attn_dist, question_proj)
        ###### end answer pooling ######

        # (batch_size, 1, value_vec_size)
        pooled_proj = relu_projection(rQ, "Wrq")
        # (batch_size, context_len, value_vec_size)
        context_proj = relu_projection(contexts, "Wp")

        # (batch_size, context_len, value_vec_size)
        context_scores_hidden = tf.tanh(pooled_proj + context_proj)
        # (batch_size, context_len, 1)
        context_scores = tf.layers.dense(
            context_scores_hidden, 1, use_bias=False)
        # (batch_size, context_len)
        context_logits = tf.squeeze(context_scores, axis=[2])

        # (batch_size, context_len)
        masked_logits1, prob_dist = masked_softmax(
            context_logits, contexts_mask, 1)

        return rQ, masked_logits1, prob_dist
def bidirectional_rnn(cell_fw, cell_bw, inputs,
                      initial_state_fw=None, initial_state_bw=None,
                      dtype=None, sequence_length=None, scope=None):
    """Creates a bidirectional recurrent neural network.

    Builds two independent RNNs with `rnn`: one over `inputs` and one over
    the reversed `inputs`. The backward outputs are reversed again and each
    time step's forward and backward outputs are concatenated along
    dimension 1, so the result has the format
    [time][batch][cell_fw.output_size + cell_bw.output_size].
    The input_size of forward and backward cell must match. No intermediate
    states are returned.

    Args:
      cell_fw: An instance of RNNCell, to be used for forward direction.
      cell_bw: An instance of RNNCell, to be used for backward direction.
      inputs: A length T list of inputs, each a tensor of shape
        [batch_size, cell.input_size].
      initial_state_fw: (optional) An initial state for the forward RNN,
        handed to `rnn` as its initial state.
      initial_state_bw: (optional) Same as for initial_state_fw, for the
        backward RNN.
      dtype: (optional) The data type for the initial state, handed to `rnn`.
      sequence_length: (optional) Sequence lengths, passed to `_reverse_seq`
        when reversing the inputs and the backward outputs.
      scope: Prefix for the two variable scopes, `<scope>_FW` and
        `<scope>_BW`; defaults to "BiRNN".

    Returns:
      A length T list of outputs (one for each input), which are
      depth-concatenated forward and backward outputs.

    Raises:
      TypeError: If "cell_fw" or "cell_bw" is not an instance of RNNCell, or
        if inputs is not a list.
      ValueError: If inputs is an empty list.
    """
    if not isinstance(cell_fw, rnn_cell.RNNCell):
        raise TypeError("cell_fw must be an instance of RNNCell")
    if not isinstance(cell_bw, rnn_cell.RNNCell):
        raise TypeError("cell_bw must be an instance of RNNCell")
    if not isinstance(inputs, list):
        raise TypeError("inputs must be a list")
    if not inputs:
        raise ValueError("inputs must not be empty")

    scope_prefix = scope or "BiRNN"

    # NOTE(review): sequence_length is used only for the reversals below; it
    # is not forwarded to rnn() in either direction.
    # Forward direction
    with vs.variable_scope(scope_prefix + "_FW"):
        forward_result = rnn(cell_fw, inputs, initial_state_fw, dtype)
        output_fw = forward_result[0]

    # Backward direction
    with vs.variable_scope(scope_prefix + "_BW"):
        reversed_inputs = _reverse_seq(inputs, sequence_length)
        backward_result = rnn(cell_bw, reversed_inputs, initial_state_bw,
                              dtype)
        reversed_outputs = backward_result[0]
    output_bw = _reverse_seq(reversed_outputs, sequence_length)

    # Concat each of the forward/backward outputs
    outputs = []
    for fw_step, bw_step in zip(output_fw, output_bw):
        outputs.append(array_ops.concat(1, [fw_step, bw_step]))

    return outputs
def _set_scope(self, scope=None):
    """Create and record this Network's variable scope, once.

    Does nothing when `self._scope` is already set. Otherwise the parent's
    scope is resolved first (recursively calling `_set_scope` on the first
    parent, if any), a uniquified variable scope named after `self._name` is
    opened beneath it and stored in `self._scope`, and scopes are then set
    for every entry of `self._non_network_sublayers`.

    Args:
      scope: Must be None. Any other value raises ValueError (checked only
        while the scope is still unset).

    Raises:
      ValueError: If `scope` is given; if the expected scope name is already
        recorded in `self._variable_scope_counts_on_init`; if the scope that
        was actually created does not end in `self._name`; or if its prefix
        does not match the parent Network's `scope_name`.
    """
    if self._scope is None:
        # NOTE(review): _first_parent is called when truthy, which suggests a
        # weak reference to the parent Network - confirm against where it is
        # assigned.
        if not self._first_parent:
            first_parent = self._first_parent
        else:
            first_parent = self._first_parent()
        if first_parent is None:
            # If we were never added to another Network, or that Network has
            # been garbage collected before being called, then we're a
            # top-level Network.
            self._finalize_name(
                # Use False to make sure the value sticks and we don't inherit
                # a parent if we're added to a network later.
                parent_network=False)
        if scope is not None:
            raise ValueError(
                "Networks may not be created with explicit scopes.")
        if first_parent:
            # Make sure the parent has a scope before nesting under it.
            first_parent._set_scope()
            parent_scope = first_parent._scope
        else:
            parent_scope = self._default_parent_variable_scope
        with variable_scope.variable_scope(parent_scope) as parent_vs:
            expected_scope_name = parent_vs.name + "/" + self._name
            if expected_scope_name in self._variable_scope_counts_on_init:
                raise ValueError((
                    "A Network named '%s' already exists (or a variable_scope was "
                    "created with this name). Names must be unique.") %
                                 (self._name,))
            # Make sure variables with this prefix will be unique.
            with variable_scope.variable_scope(
                    None, use_resource=True,
                    default_name=self._name) as scope:
                self._scope = scope
                scope_name = scope.name
                suffix_start = scope_name.rfind("/") + 1
                # rfind is -1 if there is no slash in the string, in which
                # case the suffix starts at the beginning of the string
                # (there is no prefix).
                scope_suffix = scope_name[suffix_start:]
                scope_prefix = scope_name[:suffix_start]
                # A differing suffix means the requested default name was
                # altered when the scope was created.
                if scope_suffix != self._name:
                    raise ValueError((
                        "A Network named '%s' already exists (or a variable_scope was "
                        "created with this name). Names must be unique.") %
                                     (self._name,))
                # scope_prefix keeps its trailing "/", hence the [:-1].
                if (first_parent
                        and scope_prefix[:-1] != first_parent.scope_name):
                    raise ValueError((
                        "Network variable names must match a nesting of sub-Network "
                        "names. Expected prefix '%s' from parent network, but got "
                        "'%s' when attempting to create a variable_scope for Network "
                        "'%s'. Likely an explicit variable_scope was inserted into "
                        "the nesting.") %
                                     (first_parent.scope_name,
                                      scope_prefix[:-1], self._name))
                elif not first_parent and scope_prefix:
                    # For the case when this Network is not nested inside any
                    # other Network, but is in a variable_scope. This
                    # Network's name takes on the full variable scope prefix.
                    self._name = scope_name

                for non_network_sublayer in self._non_network_sublayers:
                    self._set_scope_for_nonnetwork_sublayer(non_network_sublayer)
def build_graph(self, values, values_mask, keys):
    """Keys attend to values.

    For each key, return an attention distribution and an attention output
    vector. When `self.advanced_basic_attn` is set, keys and values are first
    passed through bias-free ReLU dense layers ("Wk" and "Wv") and the
    transposed values are divided by sqrt(self.value_vec_size) before the
    logits are computed; otherwise the raw keys and values are used.

    Inputs:
      values: Tensor shape (batch_size, num_values, value_vec_size).
      values_mask: Tensor shape (batch_size, num_values).
        1s where there's real input, 0s where there's padding
      keys: Tensor shape (batch_size, num_keys, value_vec_size)

    Outputs:
      attn_dist: Tensor shape (batch_size, num_keys, num_values).
        The second element returned by masked_softmax over the values axis.
      output: Tensor shape (batch_size, num_keys, value_vec_size).
        matmul of the attention distribution with the un-projected `values`,
        followed by dropout with self.keep_prob.
    """
    with vs.variable_scope("BasicAttn"):
        advanced = self.advanced_basic_attn

        k, v = keys, values
        if advanced:
            k = tf.layers.dense(
                keys,
                self.key_vec_size,
                activation=tf.nn.relu,
                use_bias=False,
                name="Wk")
            v = tf.layers.dense(
                values,
                self.value_vec_size,
                activation=tf.nn.relu,
                use_bias=False,
                name="Wv")

        # Calculate attention distribution
        # (batch_size, value_vec_size, num_values)
        values_t = tf.transpose(v, perm=[0, 2, 1])
        if advanced:
            values_t = values_t / np.sqrt(self.value_vec_size)
        # shape (batch_size, num_keys, num_values)
        attn_logits = tf.matmul(k, values_t)

        # shape (batch_size, 1, num_values)
        attn_logits_mask = tf.expand_dims(values_mask, 1)
        # shape (batch_size, num_keys, num_values). take softmax over values
        _, attn_dist = masked_softmax(attn_logits, attn_logits_mask, 2)

        # Use attention distribution to take weighted sum of values;
        # shape (batch_size, num_keys, value_vec_size)
        output = tf.matmul(attn_dist, values)

        # Apply dropout
        output = tf.nn.dropout(output, self.keep_prob)

        return attn_dist, output