def _create_multi_lstm_cell_ops(batch_size, num_units, input_depth,
                                num_layers, max_time, compiled):
  """Builds a stacked-LSTM `dynamic_rnn` graph together with its gradients.

  Args:
    batch_size: Batch dimension of the randomly initialized input variable.
    num_units: Number of units passed to each `LSTMCell`.
    input_depth: Innermost dimension of the input variable.
    num_layers: Number of LSTM cells stacked in the `MultiRNNCell`.
    max_time: Leading (time) dimension of the time-major input variable.
    compiled: If truthy, each cell is wrapped in `rnn_cell.CompiledWrapper`.

  Returns:
    A dict with keys "outputs", "final_state" (flattened), "outputs_grad" and
    "final_state_grad". Both gradient lists are taken with respect to the
    trainable variables, then the inputs, then the flattened initial state.
  """

  def _maybe_compile(layer):
    return rnn_cell.CompiledWrapper(layer) if compiled else layer

  root_initializer = init_ops.random_uniform_initializer(-0.1, 0.1, seed=2)
  with variable_scope.variable_scope("root", initializer=root_initializer):
    inputs = variable_scope.get_variable(
        "inputs",
        initializer=random_ops.random_uniform(
            (max_time, batch_size, input_depth), seed=1))

    layers = []
    for _ in range(num_layers):
      layers.append(_maybe_compile(core_rnn_cell_impl.LSTMCell(num_units)))
    cell = core_rnn_cell_impl.MultiRNNCell(layers)

    initial_state = cell.zero_state(
        batch_size=batch_size, dtype=dtypes.float32)
    outputs, final_state = rnn.dynamic_rnn(
        cell=cell,
        inputs=inputs,
        initial_state=initial_state,
        time_major=True)
    flat_final_state = nest.flatten(final_state)

    # Both gradient computations differentiate w.r.t. the same sources.
    trainable_variables = variables.trainable_variables()
    grad_sources = (
        trainable_variables + [inputs] + nest.flatten(initial_state))
    outputs_grad = gradients_impl.gradients([outputs], grad_sources)
    final_state_grad = gradients_impl.gradients(
        flat_final_state, grad_sources)

    return {
        "outputs": outputs,
        "final_state": flat_final_state,
        "outputs_grad": outputs_grad,
        "final_state_grad": final_state_grad,
    }
def _Model(x):
  """Applies a single sigmoid dense layer, `sigmoid(matmul(x, w) + b)`.

  Args:
    x: Input tensor multiplied against the (64, 64) weight variable "w".

  Returns:
    The result of `math_ops.sigmoid(math_ops.matmul(x, w) + b)`.
  """
  w = variable_scope.get_variable(
      "w", (64, 64),
      initializer=init_ops.random_uniform_initializer(seed=312))
  # The original statement ended with a stray comma, which bound `b` to a
  # 1-tuple containing the variable rather than to the variable itself.
  b = variable_scope.get_variable(
      "b", (64,), initializer=init_ops.zeros_initializer())
  return math_ops.sigmoid(math_ops.matmul(x, w) + b)
def _get_initializer(init_bound, dtype, seed):
  """Returns a seeded uniform initializer bounded by +/- `init_bound`.

  Args:
    init_bound: Magnitude of the lower and upper bounds.
    dtype: Requested dtype; `dtypes.float16` selects
      `_MaskedRandomUniformInitializer`, anything else selects
      `init_ops.random_uniform_initializer`.
    seed: Seed forwarded to the chosen initializer.

  Returns:
    The constructed initializer.
  """
  initializer_factory = (
      _MaskedRandomUniformInitializer
      if dtype == dtypes.float16 else init_ops.random_uniform_initializer)
  return initializer_factory(-init_bound, init_bound, dtype=dtype, seed=seed)
def testBlockGRUToGRUCellSingleStep(self):
  """Checks that one step of `GRUBlockCell` matches one step of `GRUCell`."""
  batch_size = 4
  cell_size = 5
  input_size = 6
  seed = 1994

  with self.session(use_gpu=True, graph=ops.Graph()) as sess:
    initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=seed)

    # Graph inputs; their values are overridden through the feed dict below.
    x = array_ops.zeros([batch_size, input_size])
    h = array_ops.zeros([batch_size, cell_size])

    # Concrete values fed for the inputs.
    x_value = np.random.rand(batch_size, input_size)
    h_value = np.random.rand(batch_size, cell_size)
    feeds = {x: x_value, h: h_value}

    # Reference result from the basic GRU cell.
    with vs.variable_scope("basic", initializer=initializer):
      basic_output = rnn_cell.GRUCell(cell_size)(x, h)
      sess.run([variables.global_variables_initializer()])
      basic_res = sess.run([basic_output], feeds)

    # Result from the block GRU cell under the same seeded initializer.
    with vs.variable_scope("block", initializer=initializer):
      block_output = gru_ops.GRUBlockCell(cell_size)(x, h)
      sess.run([variables.global_variables_initializer()])
      block_res = sess.run([block_output], feeds)

    self.assertEqual(len(block_res), len(basic_res))
    for block_value, basic_value in zip(block_res, basic_res):
      self.assertAllClose(block_value, basic_value)
def __call__(self, inputs, state, scope=None):
  """Run the cell on the concatenation of several embedded inputs.

  One embedding table "embedding<i>" is created per entry of
  `self._embedding_classes`; `inputs[i]` is flattened and looked up in table
  `i`, and the lookups are concatenated before being passed to the wrapped
  cell.

  Args:
    inputs: Indexable collection with one id tensor per embedding table.
    state: State forwarded unchanged to the wrapped cell.
    scope: Optional variable scope; defaults to the class name.

  Returns:
    Whatever `self._cell` returns for the concatenated embeddings and `state`.
  """
  with vs.variable_scope(scope or type(self).__name__):  # "EmbeddingWrapper2"
    with ops.device("/cpu:0"):
      # Explicit initializer first, then the enclosing scope's, then default.
      initializer = self._initializer or vs.get_variable_scope().initializer
      if not initializer:
        # Default initializer for embeddings should have variance=1.
        sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
        initializer = init_ops.random_uniform_initializer(-sqrt3, sqrt3)

      embeddings = [
          vs.get_variable(
              "embedding" + str(i),
              [num_classes, self._embedding_sizes[i]],
              initializer=initializer)
          for i, num_classes in enumerate(self._embedding_classes)
      ]
      embedded = [
          embedding_ops.embedding_lookup(
              table, array_ops.reshape(inputs[i], [-1]))
          for i, table in enumerate(embeddings)
      ]
      finalEmbedded = tf.concat(1, embedded)
  return self._cell(finalEmbedded, state)
def __call__(self, inputs, state, scope=None):
  """Run the cell on embedded inputs.

  Args:
    inputs: Tensor of ids; it is flattened before the embedding lookup.
    state: State forwarded to the wrapped cell. Its dtype (or the dtype of
      its first element when it is exactly a `tuple`) is used for the
      embedding variable.
    scope: Optional scope name; defaults to "embedding_wrapper".

  Returns:
    Whatever `self._cell` returns for the embedded inputs and `state`.
  """
  with _checked_scope(self, scope or "embedding_wrapper", reuse=self._reuse):
    with ops.device("/cpu:0"):
      # Explicit initializer first, then the enclosing scope's, then default.
      initializer = self._initializer or vs.get_variable_scope().initializer
      if not initializer:
        # Default initializer for embeddings should have variance=1.
        sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
        initializer = init_ops.random_uniform_initializer(-sqrt3, sqrt3)

      # Exact type check kept as-is: tuple subclasses use `state.dtype`.
      data_type = state[0].dtype if type(state) is tuple else state.dtype

      embedding = vs.get_variable(
          "embedding",
          [self._embedding_classes, self._embedding_size],
          initializer=initializer,
          dtype=data_type)
      flat_ids = array_ops.reshape(inputs, [-1])
      embedded = embedding_ops.embedding_lookup(embedding, flat_ids)
    return self._cell(embedded, state)
def model_fn():
  """Creates the (2, 3) float32 variable 'x' and returns an identity of it."""
  uniform_init = init_ops.random_uniform_initializer(
      1.0, 10.0, dtype=dtypes.float32)
  x = variable_scope.get_variable(
      'x', shape=(2, 3), initializer=uniform_init)
  return array_ops.identity(x)
def build(self, _):
  """Creates the trainable `[vocab_size, embedding_dim]` embedding kernel.

  The input-shape argument is ignored.
  """
  kernel_shape = [self.vocab_size, self.embedding_dim]
  kernel_initializer = init_ops.random_uniform_initializer(-0.1, 0.1)
  self.embedding = self.add_variable(
      'embedding_kernel',
      shape=kernel_shape,
      dtype=np.float32,
      initializer=kernel_initializer,
      trainable=True)
def create_ops():
  """Creates variable "var" of shape (1,) under scope "root" and returns it."""
  root_initializer = init_ops.random_uniform_initializer(-0.1, 0.1, seed=2)
  with variable_scope.variable_scope("root", initializer=root_initializer):
    return variable_scope.get_variable("var", (1,))
def create_ops():
  """Returns a seeded random-uniform tensor of shape (1,) built under "root"."""
  root_initializer = init_ops.random_uniform_initializer(-0.1, 0.1, seed=2)
  with variable_scope.variable_scope("root", initializer=root_initializer):
    return random_ops.random_uniform((1,), seed=1)
def benchmarkTfRNNLSTMTraining(self):
  """Benchmarks forward + gradient computation of a stacked `LSTMCell` RNN.

  For every configuration returned by `self._GetTestConfig()`, a static RNN
  is built on "/gpu:0" and the grouped gradient op is handed to
  `self._BenchmarkOp`.
  """
  for config_name, config in self._GetTestConfig().items():
    num_layers = config["num_layers"]
    num_units = config["num_units"]
    batch_size = config["batch_size"]
    seq_length = config["seq_length"]

    with ops.Graph().as_default(), ops.device("/gpu:0"):
      step_input = array_ops.zeros([batch_size, num_units], dtypes.float32)
      inputs = seq_length * [step_input]

      cell = core_rnn_cell_impl.LSTMCell(
          num_units=num_units,
          initializer=init_ops.random_uniform_initializer(
              -0.01, 0.01, seed=127),
          state_is_tuple=True)
      # The same cell object is repeated for every layer, as in the original.
      multi_cell = core_rnn_cell_impl.MultiRNNCell([cell] * num_layers)
      outputs, final_state = core_rnn.static_rnn(
          multi_cell, inputs, dtype=dtypes.float32)

      trainable_variables = ops.get_collection(
          ops.GraphKeys.TRAINABLE_VARIABLES)
      gradients = gradients_impl.gradients(
          [outputs, final_state], trainable_variables)
      training_op = control_flow_ops.group(*gradients)

      benchmark_name = "tf_rnn_lstm %s %s" % (
          config_name, self._GetConfigDesc(config))
      self._BenchmarkOp(training_op, benchmark_name)
def __call__(self, inputs, state, scope=None):
  """Run the cell on embedded inputs.

  Uses `self._embedding` when one was supplied; otherwise creates the
  "embedding" variable of shape
  `[self._embedding_classes, self._cell.input_size]`.

  Args:
    inputs: Tensor of ids; it is flattened before the embedding lookup.
    state: State forwarded unchanged to the wrapped cell.
    scope: Optional variable scope; defaults to the class name.

  Returns:
    Whatever `self._cell` returns for the embedded inputs and `state`.
  """
  with vs.variable_scope(scope or type(self).__name__):  # "EmbeddingWrapper"
    with ops.device("/cpu:0"):
      # Test against None explicitly: a supplied embedding may be a
      # tensor-like object whose truth value is ambiguous or disallowed.
      # NOTE(review): assumes the "unset" value of `_embedding` is None —
      # confirm against `__init__`.
      if self._embedding is not None:
        embedding = self._embedding
      else:
        # Explicit initializer first, then the enclosing scope's, then default.
        initializer = self._initializer or vs.get_variable_scope().initializer
        if not initializer:
          # Default initializer for embeddings should have variance=1.
          sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
          initializer = init_ops.random_uniform_initializer(-sqrt3, sqrt3)
        embedding = vs.get_variable(
            "embedding",
            [self._embedding_classes, self._cell.input_size],
            initializer=initializer)
      embedded = embedding_ops.embedding_lookup(
          embedding, array_ops.reshape(inputs, [-1]))
  return self._cell(embedded, state)
def testWarmStartInputLayerEmbeddingColumn(self):
  """Warm-starts a partitioned embedding column across a vocabulary change."""
  # Old and new vocabularies for the embedding column "sc_vocab".
  old_vocab = ["apple", "banana", "guava", "orange"]
  new_vocab = ["orange", "guava", "banana", "apple", "raspberry", "blueberry"]
  prev_vocab_path = self._write_vocab(old_vocab, "old_vocab")
  new_vocab_path = self._write_vocab(new_vocab, "new_vocab")

  # Save the checkpoint that the new graph will warm-start from.
  with ops.Graph().as_default() as g:
    with self.test_session(graph=g) as sess:
      _ = variable_scope.get_variable(
          "input_layer/sc_vocab_embedding/embedding_weights",
          initializer=[[0.5, 0.4], [1., 1.1], [2., 2.2], [3., 3.3]])
      self._write_checkpoint(sess)

  def _partitioner(shape, dtype):  # pylint:disable=unused-argument
    # Up to two slices along the first dimension; other dimensions are whole.
    return [min(2, shape[0].value)] + [1] * (len(shape) - 1)

  # Create feature columns.
  sc_vocab = fc.categorical_column_with_vocabulary_file(
      "sc_vocab", vocabulary_file=new_vocab_path, vocabulary_size=6)
  # Can't use constant_initializer with load_and_remap. In practice, use a
  # truncated normal initializer.
  fill_initializer = init_ops.random_uniform_initializer(
      minval=0.42, maxval=0.42)
  emb_vocab = fc.embedding_column(
      categorical_column=sc_vocab,
      dimension=2,
      initializer=fill_initializer)
  all_deep_cols = [emb_vocab]

  # New graph, new session with warmstarting.
  with ops.Graph().as_default() as g:
    with self.test_session(graph=g) as sess:
      cols_to_vars = {}
      with variable_scope.variable_scope("", partitioner=_partitioner):
        # Create the variables.
        fc.input_layer(
            features=self._create_dummy_inputs(),
            feature_columns=all_deep_cols,
            cols_to_vars=cols_to_vars)
      ws_settings = ws_util._WarmStartSettings(
          self.get_temp_dir(),
          col_to_prev_vocab={emb_vocab: prev_vocab_path})
      ws_util._warmstart_input_layer(cols_to_vars, ws_settings)
      sess.run(variables.global_variables_initializer())

      # Verify weights were correctly warmstarted. Var corresponding to
      # emb_vocab should be correctly warmstarted after vocab remapping.
      # Missing values are filled in with the EmbeddingColumn's initializer.
      expected_slices = [
          np.array([[3., 3.3], [2., 2.2], [1., 1.1]]),
          np.array([[0.5, 0.4], [0.42, 0.42], [0.42, 0.42]]),
      ]
      self._assert_cols_to_vars(
          cols_to_vars, {emb_vocab: expected_slices}, sess)
def _build(self):
  """Builds the embedding table and, optionally, a position embedding table.

  The position table (named with the "_posi" suffix) is only created when
  `self._timing == "emb"`. Both tables use a uniform initializer bounded by
  +/- `self._init_scale`.

  Returns:
    None.
  """
  table_name = self._name or "embedding_table"
  self._embeddings = variable_scope.get_variable(
      name=table_name,
      shape=[self._vocab_size, self._dimension],
      initializer=init_ops.random_uniform_initializer(
          -self._init_scale, self._init_scale))

  if self._timing != "emb":
    return

  self._position_embedding = variable_scope.get_variable(
      name=table_name + "_posi",
      shape=[self._maximum_position, self._dimension],
      initializer=init_ops.random_uniform_initializer(
          -self._init_scale, self._init_scale))
def testRandomInitializer(self):
  """Checks how seeding affects the values of partitioned variable slices."""

  def _evaluated_slices(initializer):
    # Creates two slices of a [20, 12] variable and returns their flat values.
    with self.test_session():
      var0, var1 = partitioned_variables.create_partitioned_variables(
          [20, 12], [1, 2], initializer)
      variables.global_variables_initializer().run()
      return var0.eval().flatten(), var1.eval().flatten()

  # Sanity check that the slices use a different seed when using a random
  # initializer function.
  val0, val1 = _evaluated_slices(init_ops.random_uniform_initializer())
  self.assertGreater(np.linalg.norm(val0 - val1), 1e-6)

  # Negative test that proves that slices have the same values if
  # the random initializer uses a seed.
  val0, val1 = _evaluated_slices(
      init_ops.random_uniform_initializer(seed=201))
  self.assertAllClose(val0, val1)
def testBlockGRUToGRUCellMultiStep(self):
  """Checks that `GRUBlockCell` matches `GRUCell` over several time steps.

  Both cells are unrolled with `rnn.dynamic_rnn` on the same time-major
  inputs and initial state, using the same seeded initializer, and their
  outputs and final states are compared.
  """
  with self.session(use_gpu=True, graph=ops.Graph()) as sess:
    batch_size = 2
    cell_size = 3
    input_size = 3
    time_steps = 4

    # Random initializers.
    seed = 1994
    initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=seed)
    np.random.seed(seed)

    # Inputs
    concat_x = array_ops.placeholder(
        dtypes.float32, shape=(time_steps, batch_size, input_size))
    h = array_ops.zeros([batch_size, cell_size])

    # Values for the inputs.
    x_values = np.random.rand(time_steps, batch_size, input_size)
    h_value = np.random.rand(batch_size, cell_size)

    # Output from the block GRU cell implementation.
    with vs.variable_scope("block", initializer=initializer):
      cell = gru_ops.GRUBlockCell(cell_size)
      outputs_dynamic, state_dynamic = rnn.dynamic_rnn(
          cell,
          inputs=concat_x,
          initial_state=h,
          time_major=True,
          dtype=dtypes.float32)
      feeds = {concat_x: x_values, h: h_value}
      sess.run([variables.global_variables_initializer()])
      block_res = sess.run([outputs_dynamic, state_dynamic], feeds)

    # Output from the basic GRU cell implementation.
    with vs.variable_scope("basic", initializer=initializer):
      cell = rnn_cell.GRUCell(cell_size)
      outputs_dynamic, state_dynamic = rnn.dynamic_rnn(
          cell,
          inputs=concat_x,
          initial_state=h,
          time_major=True,
          dtype=dtypes.float32)
      feeds = {concat_x: x_values, h: h_value}
      sess.run([variables.global_variables_initializer()])
      basic_res = sess.run([outputs_dynamic, state_dynamic], feeds)

    # Check the lengths of the outputs_dynamic, and states.
    self.assertEqual(len(block_res), len(basic_res))
    self.assertEqual(len(block_res[0]), len(basic_res[0]))
    self.assertEqual(len(block_res[1]), len(basic_res[1]))

    # Check the outputs_dynamic values.
    for block_output, basic_output in zip(block_res[0], basic_res[0]):
      self.assertAllClose(block_output, basic_output)

    # Check the state_dynamic value. This previously compared block_res[1]
    # with itself, which always passed and never checked the basic cell.
    self.assertAllClose(block_res[1], basic_res[1])
def _createStackBidirectionalDynamicRNN(self,
                                        use_gpu,
                                        use_shape,
                                        use_state_tuple,
                                        initial_states_fw=None,
                                        initial_states_bw=None,
                                        scope=None):
  """Builds a stacked bidirectional dynamic RNN for the tests.

  Args:
    use_gpu: Not referenced in this helper.
    use_shape: If truthy, input placeholders get a static batch dimension.
    use_state_tuple: Not referenced in this helper; cells are always built
      with `state_is_tuple=False`.
    initial_states_fw: Optional initial states for the forward cells.
    initial_states_bw: Optional initial states for the backward cells.
    scope: Optional scope forwarded to `stack_bidirectional_dynamic_rnn`.

  Returns:
    A tuple `(input_value, inputs, outputs, st_fw, st_bw, sequence_length)`.
  """
  self.layers = [2, 3]
  input_size = 5
  batch_size = 2
  max_length = 8

  initializer = init_ops.random_uniform_initializer(
      -0.01, 0.01, seed=self._seed)
  sequence_length = array_ops.placeholder(dtypes.int64)

  def _make_cells():
    # One LSTM cell per entry of self.layers.
    return [
        core_rnn_cell_impl.LSTMCell(
            num_units,
            input_size,
            initializer=initializer,
            state_is_tuple=False) for num_units in self.layers
    ]

  self.cells_fw = _make_cells()
  self.cells_bw = _make_cells()

  static_batch = batch_size if use_shape else None
  step_placeholder = array_ops.placeholder(
      dtypes.float32, shape=(static_batch, input_size))
  inputs = max_length * [step_placeholder]
  # Stack time-major, then switch to batch-major for the dynamic RNN.
  inputs_c = array_ops.transpose(array_ops.stack(inputs), [1, 0, 2])

  outputs, st_fw, st_bw = rnn.stack_bidirectional_dynamic_rnn(
      self.cells_fw,
      self.cells_bw,
      inputs_c,
      initial_states_fw=initial_states_fw,
      initial_states_bw=initial_states_bw,
      dtype=dtypes.float32,
      sequence_length=sequence_length,
      scope=scope)

  # Outputs has shape (batch_size, max_length, 2 * layers[-1]).
  output_shape = [static_batch, max_length, 2 * self.layers[-1]]
  self.assertAllEqual(outputs.get_shape().as_list(), output_shape)

  input_value = np.random.randn(batch_size, input_size)
  return input_value, inputs, outputs, st_fw, st_bw, sequence_length
def _createStackBidirectionalRNN(self,
                                 use_gpu,
                                 use_shape,
                                 use_sequence_length,
                                 initial_states_fw=None,
                                 initial_states_bw=None,
                                 scope=None):
  """Builds a stacked bidirectional static RNN for the tests.

  Args:
    use_gpu: Not referenced in this helper.
    use_shape: If truthy, input placeholders get a static batch dimension.
    use_sequence_length: If truthy, an int64 placeholder is created and passed
      as `sequence_length`; otherwise `None` is passed.
    initial_states_fw: Optional initial states for the forward cells.
    initial_states_bw: Optional initial states for the backward cells.
    scope: Optional scope forwarded to `stack_bidirectional_rnn`.

  Returns:
    A tuple `(input_value, inputs, outputs, state_fw, state_bw,
    sequence_length)`, where `outputs` is the stacked list of step outputs.
  """
  self.layers = [2, 3]
  input_size = 5
  batch_size = 2
  max_length = 8

  initializer = init_ops.random_uniform_initializer(
      -0.01, 0.01, seed=self._seed)
  sequence_length = array_ops.placeholder(
      dtypes.int64) if use_sequence_length else None

  self.cells_fw = [
      core_rnn_cell_impl.LSTMCell(
          num_units,
          input_size,
          initializer=initializer,
          state_is_tuple=False) for num_units in self.layers
  ]
  self.cells_bw = [
      core_rnn_cell_impl.LSTMCell(
          num_units,
          input_size,
          initializer=initializer,
          state_is_tuple=False) for num_units in self.layers
  ]

  inputs = max_length * [
      array_ops.placeholder(
          dtypes.float32,
          shape=(batch_size, input_size) if use_shape else (None, input_size))
  ]
  outputs, state_fw, state_bw = rnn.stack_bidirectional_rnn(
      self.cells_fw,
      self.cells_bw,
      inputs,
      initial_states_fw,
      initial_states_bw,
      dtype=dtypes.float32,
      sequence_length=sequence_length,
      scope=scope)

  self.assertEqual(len(outputs), len(inputs))
  expected_shape = [batch_size if use_shape else None, 2 * self.layers[-1]]
  for out in outputs:
    # assertEqual rather than assertAlmostEqual: on a mismatch the latter
    # tries to subtract two lists and raises TypeError instead of reporting
    # an assertion failure.
    self.assertEqual(out.get_shape().as_list(), expected_shape)

  input_value = np.random.randn(batch_size, input_size)
  outputs = array_ops.stack(outputs)

  return input_value, inputs, outputs, state_fw, state_bw, sequence_length
def testTimeReversedFusedRNN(self):
  """Compares fused forward/time-reversed cells with a bidirectional RNN.

  The reference is `static_bidirectional_rnn` over two `BasicRNNCell`s; the
  candidate concatenates a `FusedRNNCellAdaptor` with a
  `TimeReversedFusedRNN`. Outputs, states and gradients must agree.
  """
  num_units = 10
  batch_size = 5
  input_size = 20
  timelen = 15

  def _trainable_vars_under(prefix):
    return [
        v for v in variables.trainable_variables()
        if v.name.startswith(prefix)
    ]

  with self.test_session() as sess:
    initializer = init_ops.random_uniform_initializer(
        -0.01, 0.01, seed=19890213)
    fw_cell = core_rnn_cell_impl.BasicRNNCell(num_units)
    bw_cell = core_rnn_cell_impl.BasicRNNCell(num_units)
    inputs = constant_op.constant(
        np.random.randn(timelen, batch_size, input_size))

    # Reference: the unfused bi-directional rnn.
    with variable_scope.variable_scope("basic", initializer=initializer):
      unpacked_inputs = array_ops.unstack(inputs)
      outputs, fw_state, bw_state = core_rnn.static_bidirectional_rnn(
          fw_cell, bw_cell, unpacked_inputs, dtype=dtypes.float64)
      packed_outputs = array_ops.stack(outputs)
      basic_vars = _trainable_vars_under("basic/")
      sess.run([variables.global_variables_initializer()])
      basic_outputs, basic_fw_state, basic_bw_state = sess.run(
          [packed_outputs, fw_state, bw_state])
      basic_grads = sess.run(gradients_impl.gradients(packed_outputs, inputs))
      basic_wgrads = sess.run(
          gradients_impl.gradients(packed_outputs, basic_vars))

    # Candidate: fused forward cell plus time-reversed fused backward cell.
    with variable_scope.variable_scope("fused", initializer=initializer):
      fused_cell = fused_rnn_cell.FusedRNNCellAdaptor(
          core_rnn_cell_impl.BasicRNNCell(num_units))
      fused_bw_cell = fused_rnn_cell.TimeReversedFusedRNN(
          fused_rnn_cell.FusedRNNCellAdaptor(
              core_rnn_cell_impl.BasicRNNCell(num_units)))
      fw_outputs, fw_state = fused_cell(
          inputs, dtype=dtypes.float64, scope="fw")
      bw_outputs, bw_state = fused_bw_cell(
          inputs, dtype=dtypes.float64, scope="bw")
      outputs = array_ops.concat([fw_outputs, bw_outputs], 2)
      fused_vars = _trainable_vars_under("fused/")
      sess.run([variables.global_variables_initializer()])
      fused_outputs, fused_fw_state, fused_bw_state = sess.run(
          [outputs, fw_state, bw_state])
      fused_grads = sess.run(gradients_impl.gradients(outputs, inputs))
      fused_wgrads = sess.run(gradients_impl.gradients(outputs, fused_vars))

    self.assertAllClose(basic_outputs, fused_outputs)
    self.assertAllClose(basic_fw_state, fused_fw_state)
    self.assertAllClose(basic_bw_state, fused_bw_state)
    self.assertAllClose(basic_grads, fused_grads)
    # Weight gradients are compared with a looser tolerance.
    for basic_wgrad, fused_wgrad in zip(basic_wgrads, fused_wgrads):
      self.assertAllClose(basic_wgrad, fused_wgrad, rtol=1e-2, atol=1e-2)
def inference_gru_block_vs_gru_cell(batch_size,
                                    cell_size,
                                    input_size,
                                    time_steps,
                                    use_gpu=False,
                                    iters=30):
  """Benchmark inference speed between GRUBlockCell vs GRUCell.

  Prints one comma-separated line with the configuration, both timings and
  the relative improvement of the block cell in percent.

  Returns:
    A tuple `(basic_time_inference, block_time_inference)` as reported by
    `benchmarking.seconds_per_run`.
  """
  ops.reset_default_graph()
  with session.Session(graph=ops.Graph()) as sess:
    with benchmarking.device(use_gpu):
      # Random initializers.
      seed = 1994
      initializer = init_ops.random_uniform_initializer(-1, 1, seed=seed)
      np.random.seed(seed)

      # Inputs
      concat_x = vs.get_variable("concat_x",
                                 [time_steps, batch_size, input_size])
      h = vs.get_variable("h", [batch_size, cell_size])

      # Timing for the basic GRU cell implementation.
      with vs.variable_scope("basic", initializer=initializer):
        basic_cell = rnn_cell.GRUCell(cell_size)
        basic_outputs, _ = rnn.dynamic_rnn(
            basic_cell,
            inputs=concat_x,
            initial_state=h,
            time_major=True,
            dtype=dtypes.float32)
        sess.run([variables.global_variables_initializer()])
        basic_time_inference = benchmarking.seconds_per_run(
            basic_outputs, sess, iters)

      # Timing for the block GRU cell implementation.
      with vs.variable_scope("block", initializer=initializer):
        block_cell = gru_ops.GRUBlockCell(cell_size)
        block_outputs, _ = rnn.dynamic_rnn(
            block_cell,
            inputs=concat_x,
            initial_state=h,
            time_major=True,
            dtype=dtypes.float32)
        sess.run([variables.global_variables_initializer()])
        block_time_inference = benchmarking.seconds_per_run(
            block_outputs, sess, iters)

    time_saved = basic_time_inference - block_time_inference
    performance_inference = time_saved * 100 / basic_time_inference
    report_fields = (batch_size, cell_size, input_size, time_steps, use_gpu,
                     basic_time_inference, block_time_inference,
                     performance_inference)
    print(",".join(str(field) for field in report_fields))

    return basic_time_inference, block_time_inference
def testLSTMBasicToBlockCellPeeping(self):
  """Checks a two-layer peephole `LSTMBlockCell` stack against `LSTMCell`."""
  with self.test_session(use_gpu=self._use_gpu) as sess:
    x = array_ops.zeros([1, 2])
    x_values = np.random.randn(1, 2)

    # Values fed for the four state tensors (m0, m1, m2, m3).
    state_values = [
        0.1 * np.ones([1, 2]),
        -0.1 * np.ones([1, 2]),
        -0.2 * np.ones([1, 2]),
        0.2 * np.ones([1, 2]),
    ]

    initializer = init_ops.random_uniform_initializer(
        -0.01, 0.01, seed=19890212)

    def _run_stack(scope_name, make_layer):
      # Builds the same layer object twice into a MultiRNNCell and runs it.
      with variable_scope.variable_scope(scope_name, initializer=initializer):
        m0 = array_ops.zeros([1, 2])
        m1 = array_ops.zeros([1, 2])
        m2 = array_ops.zeros([1, 2])
        m3 = array_ops.zeros([1, 2])
        stack = core_rnn_cell_impl.MultiRNNCell(
            [make_layer()] * 2, state_is_tuple=True)
        g, ((out_m0, out_m1), (out_m2, out_m3)) = stack(
            x, ((m0, m1), (m2, m3)))
        sess.run([variables.global_variables_initializer()])
        feeds = {x.name: x_values}
        for state_tensor, value in zip((m0, m1, m2, m3), state_values):
          feeds[state_tensor.name] = value
        return sess.run([g, out_m0, out_m1, out_m2, out_m3], feeds)

    basic_res = _run_stack(
        "basic",
        lambda: core_rnn_cell_impl.LSTMCell(
            2, use_peepholes=True, state_is_tuple=True))
    block_res = _run_stack(
        "block", lambda: lstm_ops.LSTMBlockCell(2, use_peephole=True))

    self.assertEqual(len(basic_res), len(block_res))
    for basic_value, block_value in zip(basic_res, block_res):
      self.assertAllClose(basic_value, block_value)
def glorot_initializer(in_size, out_size):
  """Returns a uniform initializer bounded by +/- sqrt(6 / (in + out)).

  This is the normalized initialization proposed for per-layer variance
  stabilization in "Understanding the difficulty of training deep feedforward
  neural networks":
  http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf

  Args:
    in_size: Input size of the layer.
    out_size: Output size of the layer.

  Returns:
    An `init_ops.random_uniform_initializer` over `[-width, width]`.
  """
  fan_total = in_size + out_size
  width = math.sqrt(6.0 / fan_total)
  return init_ops.random_uniform_initializer(minval=-width, maxval=width)
def _static_vs_dynamic_rnn_benchmark_dynamic(inputs_t, sequence_length):
  """Builds the dynamic-RNN forward and gradient ops for the benchmark.

  Args:
    inputs_t: Tensor with a static rank-3 shape; its last dimension is used as
      the number of LSTM units.
    sequence_length: Sequence lengths forwarded to `rnn.dynamic_rnn`.

  Returns:
    A grouped op over the final state, the outputs and the gradients of both
    with respect to the trainable variables.
  """
  _, _, input_size = inputs_t.get_shape().as_list()
  cell = contrib_rnn.LSTMCell(
      num_units=input_size,
      use_peepholes=True,
      initializer=init_ops.random_uniform_initializer(-0.01, 0.01, seed=127),
      state_is_tuple=False)
  outputs, final_state = rnn.dynamic_rnn(
      cell, inputs_t, sequence_length=sequence_length, dtype=dtypes.float32)

  trainable_variables = ops_lib.get_collection(
      ops_lib.GraphKeys.TRAINABLE_VARIABLES)
  gradients = gradients_impl.gradients(
      [outputs, final_state], trainable_variables)
  return control_flow_ops.group(final_state, outputs, *gradients)
def build(self, input_shape):
  """Creates the random-features kernel, bias and scale weights.

  Args:
    input_shape: Shape of the inputs; it must have rank 2 and a defined last
      dimension.

  Raises:
    ValueError: If the input rank is not 2 or the last dimension is `None`.
  """
  input_shape = tensor_shape.TensorShape(input_shape)
  # TODO(sibyl-vie3Poto): Allow higher dimension inputs. Currently the input is expected
  # to have shape [batch_size, dimension].
  if input_shape.rank != 2:
    raise ValueError(
        'The rank of the input tensor should be 2. Got {} instead.'.format(
            input_shape.ndims))
  input_dim = input_shape.dims[1].value
  if input_dim is None:
    raise ValueError(
        'The last dimension of the inputs to `RandomFourierFeatures` '
        'should be defined. Found `None`.')

  self.input_spec = input_spec.InputSpec(ndim=2, axes={1: input_dim})

  kernel_shape = (input_dim, self.output_dim)
  kernel_initializer = _get_random_features_initializer(
      self.kernel_initializer, shape=kernel_shape)
  unscaled_kernel = self.add_weight(
      name='unscaled_random_features',
      shape=kernel_shape,
      dtype=dtypes.float32,
      initializer=kernel_initializer,
      trainable=False)

  # Bias values are drawn uniformly from [0, 2*pi) and are not trained.
  bias_initializer = init_ops.random_uniform_initializer(
      minval=0.0, maxval=2 * np.pi, dtype=dtypes.float32)
  self.bias = self.add_weight(
      name='random_features_bias',
      shape=(self.output_dim,),
      dtype=dtypes.float32,
      initializer=bias_initializer,
      trainable=False)

  if self.scale is None:
    self.scale = _get_default_scale(self.kernel_initializer, input_dim)
  scale = self.add_weight(
      name='random_features_scale',
      shape=(1,),
      dtype=dtypes.float32,
      initializer=init_ops.constant_initializer(self.scale),
      trainable=True,
      constraint='NonNeg')
  # Only the scale is trainable; the kernel is the fixed draw divided by it.
  self.kernel = (1.0 / scale) * unscaled_kernel
  super(RandomFourierFeatures, self).build(input_shape)
def _half_seq_len_vs_unroll_half_rnn_benchmark(inputs_list_t, sequence_length):
  """Builds the static-RNN forward and gradient ops for the benchmark.

  Args:
    inputs_list_t: List of tensors with static rank-2 shapes; the last
      dimension of the first entry is used as the number of LSTM units.
    sequence_length: Sequence lengths forwarded to `rnn.static_rnn`.

  Returns:
    A grouped op over the final state, the gradients with respect to the
    trainable variables, and the per-step outputs.
  """
  _, input_size = inputs_list_t[0].get_shape().as_list()
  cell = rnn_cell_impl.LSTMCell(
      num_units=input_size,
      use_peepholes=True,
      initializer=init_ops.random_uniform_initializer(-0.01, 0.01, seed=127),
      state_is_tuple=False)
  outputs, final_state = rnn.static_rnn(
      cell,
      inputs_list_t,
      sequence_length=sequence_length,
      dtype=dtypes.float32)

  trainable_variables = ops_lib.get_collection(
      ops_lib.GraphKeys.TRAINABLE_VARIABLES)
  gradients = gradients_impl.gradients(
      outputs + [final_state], trainable_variables)
  grouped_inputs = gradients + outputs
  return control_flow_ops.group(final_state, *grouped_inputs)
def __call__(self, combine_inputs, state, scope=None):
  """Run the cell on embedded inputs concatenated with an extra encoding.

  The first element of `combine_inputs` is embedded; the second is
  concatenated to that embedding along dimension 1 before the wrapped cell is
  called. The debug `print` banners that used to dump the intermediate
  tensors have been removed.

  Args:
    combine_inputs: Indexable pair `(inputs, alphabetEnc)`; `inputs` is
      flattened and looked up in the embedding.
    state: State forwarded unchanged to the wrapped cell.
    scope: Optional variable scope; defaults to the class name.

  Returns:
    Whatever `self._cell` returns for the combined embedding and `state`.
  """
  with vs.variable_scope(scope or type(self).__name__):  # "EmbeddingWrapper"
    with ops.device("/cpu:0"):
      inputs = combine_inputs[0]
      alphabetEnc = combine_inputs[1]

      # Test against None explicitly: a supplied embedding may be a
      # tensor-like object whose truth value is ambiguous or disallowed.
      # NOTE(review): assumes the "unset" value of `_embedding` is None —
      # confirm against `__init__`.
      if self._embedding is not None:
        embedding = self._embedding
      else:
        # Explicit initializer first, then the enclosing scope's, then default.
        initializer = self._initializer or vs.get_variable_scope().initializer
        if not initializer:
          # Default initializer for embeddings should have variance=1.
          sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
          initializer = init_ops.random_uniform_initializer(-sqrt3, sqrt3)
        embedding = vs.get_variable(
            "embedding",
            [self._embedding_classes, self._cell.input_size],
            initializer=initializer)
      embedded = embedding_ops.embedding_lookup(
          embedding, array_ops.reshape(inputs, [-1]))
      combine_embedded = array_ops.concat(1, [embedded, alphabetEnc])
  return self._cell(combine_embedded, state)
def single_bprop_step_gru_block_vs_gru_cell(batch_size,
                                            cell_size,
                                            input_size,
                                            use_gpu=False,
                                            iters=30):
  """Benchmark single bprop step speed between GRUBlockCell vs GRUCell.

  Prints one comma-separated line with the configuration, both timings and
  the relative improvement of the block cell in percent.

  Returns:
    A tuple `(basic_time_bprop, block_time_bprop)` as reported by
    `benchmarking.seconds_per_run`.
  """
  ops.reset_default_graph()
  with session.Session(graph=ops.Graph()) as sess:
    with benchmarking.device(use_gpu):
      initializer = init_ops.random_uniform_initializer(-1, 1, seed=1989)
      # Inputs
      x = vs.get_variable("x", [batch_size, input_size])
      h = vs.get_variable("h", [batch_size, cell_size])

      def _time_bprop(scope_name, cell_cls):
        # Times the gradient of one cell step with respect to `h`.
        with vs.variable_scope(scope_name, initializer=initializer):
          output = cell_cls(cell_size)(array_ops.identity(x),
                                       array_ops.identity(h))
          sess.run([variables.global_variables_initializer()])
          grad_output_wrt_input = gradients_impl.gradients([output], h)
          return benchmarking.seconds_per_run(grad_output_wrt_input, sess,
                                              iters)

      # Basic GRU cell implementation first, then the block implementation.
      basic_time_bprop = _time_bprop("basic", rnn_cell.GRUCell)
      block_time_bprop = _time_bprop("block", gru_ops.GRUBlockCell)

  time_saved = basic_time_bprop - block_time_bprop
  performance_inference = time_saved * 100 / basic_time_bprop
  report_fields = (batch_size, cell_size, input_size, use_gpu,
                   basic_time_bprop, block_time_bprop, performance_inference)
  print(",".join(str(field) for field in report_fields))

  return basic_time_bprop, block_time_bprop
def embedding_attention_decoder(decoder_inputs,
                                initial_state,
                                attention_states,
                                cell,
                                num_symbols,
                                batch_size,
                                state_size,
                                decoder_inputs_positions=None,
                                decoder_inputs_maps=None,
                                output_size=None,
                                feed_previous=False,
                                dtype=dtypes.float32,
                                scope=None):
  """RNN decoder with embedding and attention and a pure-decoding option.

  Embeds `decoder_inputs` with a "[num_symbols, cell.input_size]" embedding
  variable and delegates to `attention_decoder`. Modification with respect to
  the standard decoder: no output projection.

  Args:
    decoder_inputs: a list of 1D batch-sized int32 Tensors (decoder inputs).
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
    cell: rnn_cell.RNNCell defining the cell function.
    num_symbols: integer, how many symbols come into the embedding.
    batch_size: forwarded to `attention_decoder` (needed for decoding).
    state_size: forwarded unchanged to `attention_decoder`.
    decoder_inputs_positions: a list of 2D Tensors of shape [batch_size, 3].
    decoder_inputs_maps: a 1D Tensor of length batch_size.
    output_size: size of the output vectors; if None, use cell.output_size.
    feed_previous: Boolean; if True, `attention_decoder` receives a loop
      function computing
        next = embedding_lookup(embedding, argmax(previous_output)),
      which implements a greedy decoder and can also be used during training
      to emulate http://arxiv.org/pdf/1506.03099v2.pdf. If False, no loop
      function is passed (the standard decoder case).
    dtype: Not referenced in this function body.
    scope: VariableScope for the created subgraph; defaults to
      "embedding_attention_decoder".

  Returns:
    The result of `attention_decoder`. Per the original documentation this is:
    outputs: A list of the same length as decoder_inputs of 2D Tensors with
      shape [batch_size x output_size] containing the generated outputs.
    states: The state of each decoder cell in each time-step.
    attentions: a list of 2D Tensors of shape [batch_size, cell.state_size].
    environments: a list of 2D Tensors of shape [batch_size, state_size].
  """
  if output_size is None:
    output_size = cell.output_size

  with vs.variable_scope(scope or "embedding_attention_decoder"):
    with ops.device("/cpu:0"):
      embedding = vs.get_variable(
          "embedding",
          shape=[num_symbols, cell.input_size],
          initializer=init_ops.random_uniform_initializer(-0.08, 0.08))

    def extract_argmax_and_embed(prev, _):
      """Loop_function that extracts the symbol from prev and embeds it."""
      prev_symbol = array_ops.stop_gradient(math_ops.argmax(prev, 1))
      return embedding_ops.embedding_lookup(embedding, prev_symbol)

    # Greedy feeding of the previous prediction is only used on request.
    loop_function = extract_argmax_and_embed if feed_previous else None

    emb_inp = []
    for symbol_ids in decoder_inputs:
      emb_inp.append(embedding_ops.embedding_lookup(embedding, symbol_ids))

    return attention_decoder(
        emb_inp,
        initial_state,
        attention_states,
        cell,
        batch_size,
        state_size,
        decoder_inputs_positions=decoder_inputs_positions,
        decoder_inputs_maps=decoder_inputs_maps,
        output_size=output_size,
        loop_function=loop_function)
def testLSTMBasicToBlockPeeping(self):
    """Peephole LSTMCell, block_lstm and LSTMBlockFusedCell must agree."""
    with self.test_session(use_gpu=True) as sess:
        batch_size, input_size, cell_size, sequence_length = 2, 3, 4, 5

        # One random [batch_size, input_size] tensor per time step.
        inputs = [
            ops.convert_to_tensor(
                np.random.randn(batch_size, input_size), dtype=dtypes.float32)
            for _ in range(sequence_length)
        ]

        initializer = init_ops.random_uniform_initializer(
            -0.01, 0.01, seed=19890212)

        # Reference: the plain LSTMCell with peepholes.
        with variable_scope.variable_scope("basic", initializer=initializer):
            basic_cell = rnn_cell.LSTMCell(
                cell_size, use_peepholes=True, state_is_tuple=True)
            outputs, state = rnn.static_rnn(
                basic_cell, inputs, dtype=dtypes.float32)

            sess.run([variables.global_variables_initializer()])
            basic_outputs, basic_state = sess.run([outputs, state[0]])
            basic_grads = sess.run(gradients_impl.gradients(outputs, inputs))
            basic_wgrads = sess.run(
                gradients_impl.gradients(
                    outputs, variables.trainable_variables()))

        # Candidate 1: the block_lstm op fed with explicitly created weights.
        with variable_scope.variable_scope("block", initializer=initializer):
            w = variable_scope.get_variable(
                "w",
                shape=[input_size + cell_size, cell_size * 4],
                dtype=dtypes.float32)
            b = variable_scope.get_variable(
                "b",
                shape=[cell_size * 4],
                dtype=dtypes.float32,
                initializer=init_ops.zeros_initializer())
            wci, wcf, wco = [
                variable_scope.get_variable(
                    peephole_name, shape=[cell_size], dtype=dtypes.float32)
                for peephole_name in ("wci", "wcf", "wco")
            ]

            seq_len_max = ops.convert_to_tensor(
                sequence_length, dtype=dtypes.int64)
            block_results = block_lstm(
                seq_len_max,
                inputs,
                w,
                b,
                wci=wci,
                wcf=wcf,
                wco=wco,
                cell_clip=0,
                use_peephole=True)
            # Only the last of the seven results is compared.
            _, _, _, _, _, _, outputs = block_results

            sess.run([variables.global_variables_initializer()])
            block_outputs = sess.run(outputs)
            block_grads = sess.run(gradients_impl.gradients(outputs, inputs))
            block_wgrads = sess.run(
                gradients_impl.gradients(outputs, [w, b, wci, wcf, wco]))

        self.assertAllClose(basic_outputs, block_outputs)
        self.assertAllClose(basic_grads, block_grads)
        for expected, actual in zip(basic_wgrads, block_wgrads):
            self.assertAllClose(expected, actual, rtol=1e-2, atol=1e-2)

        # Candidate 2: the fused block cell.
        with variable_scope.variable_scope("fused", initializer=initializer):
            fused_cell = lstm_ops.LSTMBlockFusedCell(
                cell_size, cell_clip=0, use_peephole=True)
            outputs, state = fused_cell(inputs, dtype=dtypes.float32)

            sess.run([variables.global_variables_initializer()])
            fused_outputs, fused_state = sess.run([outputs, state[0]])
            fused_grads = sess.run(gradients_impl.gradients(outputs, inputs))
            fused_vars = [
                var for var in variables.trainable_variables()
                if var.name.startswith("fused/")
            ]
            fused_wgrads = sess.run(
                gradients_impl.gradients(outputs, fused_vars))

        self.assertAllClose(basic_outputs, fused_outputs)
        self.assertAllClose(basic_state, fused_state)
        self.assertAllClose(basic_grads, fused_grads)
        for expected, actual in zip(basic_wgrads, fused_wgrads):
            self.assertAllClose(expected, actual, rtol=1e-2, atol=1e-2)
def testBasicRNNFusedWrapper(self):
    """Check that FusedRNNCellAdaptor around BasicRNNCell matches static_rnn.

    The adaptor is exercised twice, once with its default settings and once
    with `use_dynamic_rnn=True`; outputs, final state, input gradients and
    weight gradients are each compared with the plain `static_rnn` run.
    """
    with self.test_session() as sess:
        initializer = init_ops.random_uniform_initializer(
            -0.01, 0.01, seed=19890212)
        cell = core_rnn_cell_impl.BasicRNNCell(10)
        batch_size = 5
        input_size = 20
        timelen = 15
        inputs = constant_op.constant(
            np.random.randn(timelen, batch_size, input_size))

        def scope_variables(prefix):
            """Trainable variables whose name starts with `prefix`."""
            return [
                var for var in variables.trainable_variables()
                if var.name.startswith(prefix)
            ]

        # Reference run through static_rnn on the unstacked inputs.
        with variable_scope.variable_scope("basic", initializer=initializer):
            unpacked_inputs = array_ops.unstack(inputs)
            outputs, state = core_rnn.static_rnn(
                cell, unpacked_inputs, dtype=dtypes.float64)
            packed_outputs = array_ops.stack(outputs)
            basic_vars = scope_variables("basic/")
            sess.run([variables.global_variables_initializer()])
            basic_outputs, basic_state = sess.run([packed_outputs, state])
            basic_grads = sess.run(
                gradients_impl.gradients(packed_outputs, inputs))
            basic_wgrads = sess.run(
                gradients_impl.gradients(packed_outputs, basic_vars))

        def run_fused(scope_name, **adaptor_kwargs):
            """Build and evaluate the adaptor inside its own variable scope."""
            with variable_scope.variable_scope(
                    scope_name, initializer=initializer):
                fused_cell = fused_rnn_cell.FusedRNNCellAdaptor(
                    core_rnn_cell_impl.BasicRNNCell(10), **adaptor_kwargs)
                fused_out, fused_state_op = fused_cell(
                    inputs, dtype=dtypes.float64)
                fused_vars = scope_variables(scope_name + "/")
                sess.run([variables.global_variables_initializer()])
                out_val, state_val = sess.run([fused_out, fused_state_op])
                grads_val = sess.run(
                    gradients_impl.gradients(fused_out, inputs))
                wgrads_val = sess.run(
                    gradients_impl.gradients(fused_out, fused_vars))
            return out_val, state_val, grads_val, wgrads_val

        def assert_matches_basic(fused_result):
            """Compare one fused run against the static_rnn reference."""
            out_val, state_val, grads_val, wgrads_val = fused_result
            self.assertAllClose(basic_outputs, out_val)
            self.assertAllClose(basic_state, state_val)
            self.assertAllClose(basic_grads, grads_val)
            for basic, fused in zip(basic_wgrads, wgrads_val):
                self.assertAllClose(basic, fused, rtol=1e-2, atol=1e-2)

        assert_matches_basic(run_fused("fused_static"))
        assert_matches_basic(run_fused("fused_dynamic", use_dynamic_rnn=True))
def training_gru_block_vs_gru_cell(batch_size,
                                   cell_size,
                                   input_size,
                                   time_steps,
                                   use_gpu=False,
                                   iters=30):
    """Benchmark training speed between GRUBlockCell vs GRUCell.

    Both cells are unrolled with `dynamic_rnn` over the same variables, trained
    with plain gradient descent on a mean-squared error, and timed with
    `time_taken_by_op`. One CSV line with the sizes, both timings and the
    relative improvement (in percent) is printed.

    Returns:
      A pair (basic_time_training, block_time_training).
    """
    ops.reset_default_graph()
    with session.Session(graph=ops.Graph()) as sess:
        # Pin the whole benchmark to the requested device.
        with ops.device("/gpu:0" if use_gpu else "/cpu:0"):
            # Random initializers.
            seed = 1994
            initializer = init_ops.random_uniform_initializer(-1, 1, seed=seed)
            np.random.seed(seed)

            # Inputs, initial state and regression target.
            concat_x = vs.get_variable(
                "concat_x", [time_steps, batch_size, input_size])
            h = vs.get_variable("h", [batch_size, cell_size])
            y = vs.get_variable("y", [time_steps, batch_size, cell_size])

            def time_training_step(scope_name, cell_factory):
                """Build one cell's training graph and time a step of it."""
                with vs.variable_scope(scope_name, initializer=initializer):
                    cell = cell_factory(cell_size)
                    outputs_dynamic, _ = rnn.dynamic_rnn(
                        cell,
                        inputs=concat_x,
                        initial_state=h,
                        time_major=True,
                        dtype=dtypes.float32)
                    sess.run([variables.global_variables_initializer()])
                    cost = math_ops.reduce_mean(
                        math_ops.square(outputs_dynamic - y))
                    learning_rate = 0.01
                    train_op = gradient_descent.GradientDescentOptimizer(
                        learning_rate).minimize(cost)
                    # Time for a training step.
                    return time_taken_by_op(train_op, sess, iters)

            # Basic GRU cell implementation.
            basic_time_training = time_training_step(
                "basic", core_rnn_cell_impl.GRUCell)
            # Block GRU cell implementation.
            block_time_training = time_training_step(
                "block", gru_ops.GRUBlockCell)

            performance_training = (
                (basic_time_training - block_time_training) * 100 /
                basic_time_training)

            report_fields = (batch_size, cell_size, input_size, time_steps,
                             use_gpu, basic_time_training,
                             block_time_training, performance_training)
            print(",".join(str(field) for field in report_fields))

        return basic_time_training, block_time_training
def testDerivativeOfBlockGRUToGRUCellSingleStep(self):
    """Single-step gradients of GRUBlockCell and GRUCell must agree.

    For each cell the gradients of the output with respect to x, h, w_ru,
    w_c, b_ru and b_c (in that order) are evaluated and compared.
    """
    with self.test_session(use_gpu=self._use_gpu, graph=ops.Graph()) as sess:
        batch_size = 2
        cell_size = 3
        input_size = 4

        seed = 1994
        initializer = init_ops.random_uniform_initializer(
            -0.01, 0.01, seed=seed)
        np.random.seed(seed)

        # Graph inputs; the actual values are supplied through the feed dict.
        x = array_ops.zeros([batch_size, input_size])
        h = array_ops.zeros([batch_size, cell_size])

        # Values for the inputs.
        x_value = np.random.rand(batch_size, input_size)
        h_value = np.random.rand(batch_size, cell_size)

        def evaluate_gradients(scope_name, cell_factory, first_var):
            """Build one cell and evaluate d(output)/d(inputs and weights).

            `first_var` is the index in `global_variables()` of the first of
            the four variables belonging to this cell.
            """
            with vs.variable_scope(scope_name, initializer=initializer):
                output = cell_factory(cell_size)(x, h)
                sess.run([variables.global_variables_initializer()])

                cell_variables = variables.global_variables()[
                    first_var:first_var + 4]
                [w_ru, b_ru, w_c, b_c] = cell_variables

                # Order: x, h, both kernels, then both biases.
                targets = (x, h, w_ru, w_c, b_ru, b_c)
                gradient_ops = [
                    gradients_impl.gradients([output], target)
                    for target in targets
                ]
                return sess.run(gradient_ops, {x: x_value, h: h_value})

        # Gradients from the block GRU cell implementation.
        d_block_res = evaluate_gradients("block", gru_ops.GRUBlockCell, 0)
        # Gradients from the basic GRU cell implementation.
        d_basic_res = evaluate_gradients(
            "basic", core_rnn_cell_impl.GRUCell, 4)

        # Check lengths of derivative results.
        self.assertEqual(len(d_block_res), len(d_basic_res))
        # Check the value of every derivative result.
        for block, basic in zip(d_block_res, d_basic_res):
            self.assertAllClose(block, basic)
def __call__(self, inputs, state, scope=None):
    """Run one step of the phased LSTM.

    The cell is a regular (optionally peephole / projected) LSTM whose new
    cell and output states are blended with the previous ones through a
    time-dependent gate `k` ("kronos gate" in the comments below).

    Args:
      inputs: input Tensor, 2D, `batch x (features + 1)`. The LAST column is
        read as the time stamp of each example; the remaining columns are
        the features fed to the LSTM.
      state: if `state_is_tuple` is False, this must be a state Tensor,
        `2-D, batch x state_size`.  If `state_is_tuple` is True, this must be
        a tuple of state Tensors, both `2-D`, with column sizes `c_state` and
        `m_state`.
      scope: VariableScope for the created subgraph; defaults to the class
        name.

    Returns:
      A tuple containing:
      - A `2-D, [batch x output_dim]`, Tensor representing the output of the
        LSTM after reading `inputs` when previous state was `state`.
        Here output_dim is:
           num_proj if num_proj was set,
           num_units otherwise.
      - Tensor(s) representing the new state of LSTM after reading `inputs`
        when the previous state was `state`.  Same type and shape(s) as
        `state`.

    Raises:
      ValueError: If input size cannot be inferred from inputs via
        static shape inference.
    """
    num_proj = self._num_units if self._num_proj is None else self._num_proj

    if self._state_is_tuple:
        (c_prev, m_prev) = state
    else:
        c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
        m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])

    dtype = inputs.dtype
    input_size = inputs.get_shape().with_rank(2)[1]
    if input_size.value is None:
        raise ValueError("Could not infer input size from inputs.get_shape()[-1]")

    with vs.variable_scope(scope or type(self).__name__,
                           initializer=self._initializer):  # "LSTMCell"
        i_size = input_size.value - 1  # -1 to extract time
        # Split the last column (time) from the feature columns.
        times = array_ops.slice(inputs, [0, i_size], [-1, 1])
        filtered_inputs = array_ops.slice(inputs, [0, 0], [-1, i_size])

        # --------------------------------------- #
        # ------------- PHASED LSTM ------------- #
        # ---------------- BEGIN ---------------- #
        # --------------------------------------- #
        # tau divides the shifted time below, r_on is compared with the
        # resulting phase, and s is subtracted from the time stamps.
        tau = vs.get_variable(
            "T", shape=[self._num_units],
            initializer=random_exp_initializer(0, self.tau_init)
            if not self.manual_set
            else init_ops.constant_initializer(self.tau_init),
            trainable=self.trainable, dtype=dtype)

        r_on = vs.get_variable(
            "R", shape=[self._num_units],
            initializer=init_ops.constant_initializer(self.r_on_init),
            trainable=self.trainable, dtype=dtype)

        s = vs.get_variable(
            "S", shape=[self._num_units],
            initializer=init_ops.random_uniform_initializer(
                0., tau.initialized_value())
            if not self.manual_set
            else init_ops.constant_initializer(0.),
            trainable=self.trainable, dtype=dtype)
        # for backward compatibility (v < 0.12.0) use the following line
        # instead of the above
        # initializer = init_ops.random_uniform_initializer(0., tau), dtype = dtype)

        tau_broadcast = tf.expand_dims(tau, dim=0)
        r_on_broadcast = tf.expand_dims(r_on, dim=0)
        s_broadcast = tf.expand_dims(s, dim=0)

        # Only the magnitudes of r_on and tau are used.
        r_on_broadcast = tf.abs(r_on_broadcast)
        tau_broadcast = tf.abs(tau_broadcast)
        times = tf.tile(times, [1, self._num_units])

        # calculate kronos gate
        # The double mod keeps phi non-negative before normalising by tau.
        phi = tf.div(
            tf.mod(tf.mod(times - s_broadcast, tau_broadcast) + tau_broadcast,
                   tau_broadcast),
            tau_broadcast)
        is_up = tf.less(phi, (r_on_broadcast * 0.5))
        is_down = tf.logical_and(tf.less(phi, r_on_broadcast),
                                 tf.logical_not(is_up))

        # when manually setting, hard on over r_on, else as previous
        # NOTE(review): in the manual branch the gate is to_float(is_up), i.e.
        # 0 while is_down holds -- confirm this matches the intent stated in
        # the comment above.
        if self.manual_set:
            k = tf.select(tf.logical_or(is_up, is_down),
                          tf.to_float(is_up),
                          self.alpha * phi)
        else:
            k = tf.select(is_up,
                          phi / (r_on_broadcast * 0.5),
                          tf.select(is_down,
                                    2. - 2. * (phi / r_on_broadcast),
                                    self.alpha * phi))
        # --------------------------------------- #
        # ------------- PHASED LSTM ------------- #
        # ----------------- END ----------------- #
        # --------------------------------------- #

        concat_w = _get_concat_variable(
            "W", [i_size + num_proj, 4 * self._num_units],
            dtype, self._num_unit_shards)

        b = vs.get_variable(
            "B", shape=[4 * self._num_units],
            initializer=init_ops.zeros_initializer, dtype=dtype)

        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        cell_inputs = array_ops.concat(1, [filtered_inputs, m_prev])
        lstm_matrix = nn_ops.bias_add(math_ops.matmul(cell_inputs, concat_w), b)
        i, j, f, o = array_ops.split(1, 4, lstm_matrix)

        # Diagonal connections
        if self._use_peepholes:
            w_f_diag = vs.get_variable(
                "W_F_diag", shape=[self._num_units], dtype=dtype)
            w_i_diag = vs.get_variable(
                "W_I_diag", shape=[self._num_units], dtype=dtype)
            w_o_diag = vs.get_variable(
                "W_O_diag", shape=[self._num_units], dtype=dtype)

        if self._use_peepholes:
            c = (sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
                 sigmoid(i + w_i_diag * c_prev) * self._activation(j))
        else:
            c = (sigmoid(f + self._forget_bias) * c_prev +
                 sigmoid(i) * self._activation(j))

        if self._cell_clip is not None:
            # pylint: disable=invalid-unary-operand-type
            c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
            # pylint: enable=invalid-unary-operand-type

        if self._use_peepholes:
            m = sigmoid(o + w_o_diag * c) * self._activation(c)
        else:
            m = sigmoid(o) * self._activation(c)

        if self._num_proj is not None:
            concat_w_proj = _get_concat_variable(
                "W_P", [self._num_units, self._num_proj],
                dtype, self._num_proj_shards)

            # Use the same `math_ops` module as the gate matmul above instead
            # of reaching for it through the `tf` namespace.
            m = math_ops.matmul(m, concat_w_proj)
            if self._proj_clip is not None:
                # pylint: disable=invalid-unary-operand-type
                m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
                # pylint: enable=invalid-unary-operand-type

        # APPLY KRONOS GATE
        # Blend the freshly computed states with the previous ones.
        c = k * c + (1. - k) * c_prev
        m = k * m + (1. - k) * m_prev
        # END KRONOS GATE

    new_state = (LSTMStateTuple(c, m) if self._state_is_tuple
                 else array_ops.concat(1, [c, m]))
    return m, new_state
def testBlockGRUToGRUCellMultiStep(self):
    """GRUBlockCell and GRUCell must agree when unrolled with dynamic_rnn.

    Both the per-step outputs and the final state of the two implementations
    are compared.
    """
    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
        batch_size = 2
        cell_size = 3
        input_size = 3
        time_steps = 4

        # Random initializers.
        seed = 1994
        initializer = init_ops.random_uniform_initializer(
            -0.01, 0.01, seed=seed)
        np.random.seed(seed)

        # Inputs
        concat_x = array_ops.placeholder(
            dtypes.float32, shape=(time_steps, batch_size, input_size))
        # The zeros tensor is overridden through the feed dict below.
        h = array_ops.zeros([batch_size, cell_size])

        # Values for the inputs.
        x_values = np.random.rand(time_steps, batch_size, input_size)
        h_value = np.random.rand(batch_size, cell_size)

        # Output from the block GRU cell implementation.
        with vs.variable_scope("block", initializer=initializer):
            cell = gru_ops.GRUBlockCell(cell_size)
            outputs_dynamic, state_dynamic = rnn.dynamic_rnn(
                cell,
                inputs=concat_x,
                initial_state=h,
                time_major=True,
                dtype=dtypes.float32)
            feeds = {concat_x: x_values, h: h_value}
            sess.run([variables.global_variables_initializer()])
            block_res = sess.run([outputs_dynamic, state_dynamic], feeds)

        # Output from the basic GRU cell implementation.
        with vs.variable_scope("basic", initializer=initializer):
            cell = rnn_cell.GRUCell(cell_size)
            outputs_dynamic, state_dynamic = rnn.dynamic_rnn(
                cell,
                inputs=concat_x,
                initial_state=h,
                time_major=True,
                dtype=dtypes.float32)
            feeds = {concat_x: x_values, h: h_value}
            sess.run([variables.global_variables_initializer()])
            basic_res = sess.run([outputs_dynamic, state_dynamic], feeds)

        # Check the lengths of the outputs_dynamic, and states.
        self.assertEqual(len(block_res), len(basic_res))
        self.assertEqual(len(block_res[0]), len(basic_res[0]))
        self.assertEqual(len(block_res[1]), len(basic_res[1]))

        # Check the outputs_dynamic values.
        for block_output, basic_output in zip(block_res[0], basic_res[0]):
            self.assertAllClose(block_output, basic_output)

        # Check the state_dynamic value. The block state must be compared
        # with the basic state; comparing it with itself always passes.
        self.assertAllClose(block_res[1], basic_res[1])
def tmp():
    # Scratch collection of alternative RNN encoders. Only the first branch
    # of the `if True / elif True` chain can ever run; the remaining branches
    # are kept as switchable alternatives. Each branch ends by binding
    # `last_outputs`; nothing is returned from this function.
    #
    # NOTE(review): `content_embeddings`, `content_lengths`,
    # `dropout_keep_prob`, `RNN_UNIT_SIZE`, `CONTENT_DIM` and `make_rnn_cell`
    # are not defined in this block. `content_embeddings` and
    # `content_lengths` are also assigned in later (dead) branches, which
    # makes them local names here -- as written, the first branch would read
    # them before assignment. Presumably this code was lifted out of a larger
    # function; confirm before calling it.
    initializer = init_ops.random_uniform_initializer(-0.01, 0.01)

    def lstm_cell():
        # Builds one LSTM cell using the uniform initializer above.
        hidden_size = RNN_UNIT_SIZE
        input_size = CONTENT_DIM
        # NOTE(review): `input_size` is passed as the second positional
        # argument; verify it lands on the intended LSTMCell parameter for
        # the TensorFlow version in use.
        cell = tf.contrib.rnn.LSTMCell(hidden_size, input_size, initializer=initializer, state_is_tuple=True)
        return cell

    if True:
        # Active variant: 2-layer LSTM wrapped with an attention window of 16.
        attn_length = 16
        cells = [lstm_cell() for _ in range(2)]
        cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
        cell = tf.contrib.rnn.AttentionCellWrapper(cell, attn_length, state_is_tuple=True)
        outputs, states = tf.nn.dynamic_rnn(cell, content_embeddings, sequence_length=content_lengths, dtype=tf.float32)
        #last_outputs = states[0][-1].h
        last_outputs = tf.concat([states[0][-1].h, states[-1]], 1)
    elif True:
        # Dead variant: fused block LSTM over inputs unstacked along axis 1.
        content_embeddings = tf.unstack(content_embeddings, 200, 1)
        cell = lstm_ops.LSTMBlockFusedCell(RNN_UNIT_SIZE)
        content_lengths = tf.cast(content_lengths, tf.int32)
        outputs, state = cell(content_embeddings, sequence_length=content_lengths, dtype=tf.float32)
        last_outputs = state.h
    elif True:
        # Dead variant: cell built by `make_rnn_cell` from LSTMBlockCell.
        layer_sizes = [RNN_UNIT_SIZE, RNN_UNIT_SIZE]
        cell = make_rnn_cell(layer_sizes, dropout_keep_prob=dropout_keep_prob, base_cell=lstm_ops.LSTMBlockCell, attn_length=16)
        outputs, final_state = tf.nn.dynamic_rnn(
            cell, content_embeddings, sequence_length=content_lengths,
            swap_memory=True, dtype=tf.float32)
        last_outputs = final_state[-1].h
        #last_outputs = tf.concat([final_state[-1].h, final_state[0][1]], 1)
    elif True:
        # Dead variant: plain 2-layer LSTM.
        cell = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(2)], state_is_tuple=True)
        outputs, states = tf.nn.dynamic_rnn(cell, content_embeddings, sequence_length=content_lengths, dtype=tf.float32)
        last_outputs = states[-1].h
    elif True:
        # Dead variant: bidirectional dynamic RNN; both final h states are
        # concatenated.
        num_hidden = RNN_UNIT_SIZE
        cell_fw = tf.nn.rnn_cell.LSTMCell(num_units=num_hidden, state_is_tuple=True)
        cell_bw = tf.nn.rnn_cell.LSTMCell(num_units=num_hidden, state_is_tuple=True)
        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw, cell_bw, content_embeddings,
            sequence_length=content_lengths, dtype=tf.float32)
        output_fw, output_bw = outputs
        output_state_fw, output_state_bw = states
        #last_outputs = tf.concat([output_fw[:, 0], output_state_bw.h], 1)
        last_outputs = tf.concat([output_state_fw.h, output_state_bw.h], 1)
    elif True:
        # Dead variant: static bidirectional RNN over unstacked inputs.
        num_hidden = RNN_UNIT_SIZE
        lstm_fw_cell = tf.contrib.rnn.BasicLSTMCell(num_hidden, forget_bias=1.0)
        lstm_bw_cell = tf.contrib.rnn.BasicLSTMCell(num_hidden, forget_bias=1.0)
        content_embeddings = tf.unstack(content_embeddings, 200, 1)
        outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(
            lstm_fw_cell, lstm_bw_cell, content_embeddings,
            sequence_length=content_lengths, dtype=tf.float32)
        last_outputs = outputs[-1]
def __init__(self,
             imgSize,
             vocabSize,
             embedSize,
             use_lstm,
             rnnHiddenSize,
             rnnLayers,
             start,
             end,
             batch_size,
             learning_rate,
             learning_rate_decay_factor,
             min_learning_rate,
             training_steps_per_epoch,
             keep_prob=0.5,
             max_gradient_norm=5.0,
             is_training=True):
    """Build the 10-round question/answer encoder-decoder graph.

    Each of the 10 rounds encodes the question, fuses it with the image
    feature and the running history state, and decodes a 21-step answer. In
    training mode the per-round sequence losses are averaged into `self.loss`
    and optimised with Adam on globally clipped gradients; otherwise the
    decoder feeds back its own arg-max predictions and the per-step log
    probabilities are exposed as `self.ans_word_probs`.

    Args:
      imgSize: width of the image feature placeholder.
      vocabSize: number of rows of the embedding table and decoder output
        size.
      embedSize: width of the embedding table.
      use_lstm: if true use BasicLSTMCell, otherwise GRUCell.
      rnnHiddenSize: number of units of every RNN cell.
      rnnLayers: number of stacked cells (stacked only when > 1).
      start: symbol id fed as the first decoder input.
      end: symbol id appended to the generated answers and searched for to
        derive their length.
      batch_size: static batch size of all placeholders.
      learning_rate: initial learning rate (training only).
      learning_rate_decay_factor: staircase exponential decay factor.
      min_learning_rate: lower bound of the decayed learning rate.
      training_steps_per_epoch: decay period in global steps.
      keep_prob: dropout keep probability applied to the fused vector during
        training.
      max_gradient_norm: global-norm clipping threshold.
      is_training: selects the training or the inference graph.
    """
    if is_training:
        self.global_step = tf.Variable(0, trainable=False)
        self.learning_rate = tf.maximum(
            tf.train.exponential_decay(
                learning_rate,
                self.global_step,
                training_steps_per_epoch,
                learning_rate_decay_factor,
                staircase=True),
            min_learning_rate)

    # Ground-truth answer inputs; only read when is_training is true.
    self.answers_ph = tf.placeholder(
        tf.int32, shape=[batch_size, 10, 20], name="answers")
    self.answer_lengths_ph = tf.placeholder(
        tf.int32, shape=[batch_size, 10], name="answer_lengths")
    self.targets_ph = tf.placeholder(
        tf.int32, shape=[batch_size, 10, 21], name="targets")

    self.image_feature_ph = tf.placeholder(
        tf.float32, shape=[batch_size, imgSize], name="image_feature")
    self.caption_ph = tf.placeholder(
        tf.int32, shape=[batch_size, 40], name="caption")
    self.caption_length_ph = tf.placeholder(
        tf.int32, shape=[batch_size], name="caption_length")
    self.questions_ph = tf.placeholder(
        tf.int32, shape=[batch_size, 10, 20], name="questions")
    self.question_lengths_ph = tf.placeholder(
        tf.int32, shape=[batch_size, 10], name="question_lengths")

    START = tf.constant(value=[start] * batch_size)
    END = tf.constant(value=[end] * batch_size)

    # Embedding (share)
    with ops.device("/cpu:0"):
        if vs.get_variable_scope().initializer:
            initializer = vs.get_variable_scope().initializer
        else:
            # Default initializer for embeddings should have variance=1.
            sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
            initializer = init_ops.random_uniform_initializer(-sqrt3, sqrt3)
        embedding = vs.get_variable(
            "embedding", [vocabSize, embedSize],
            initializer=initializer,
            dtype=tf.float32)
        START_EMB = embedding_ops.embedding_lookup(embedding, START)
        END_EMB = embedding_ops.embedding_lookup(embedding, END)

    # split placeholders and embed
    questions = tf.split(
        value=self.questions_ph, num_or_size_splits=10,
        axis=1)  # list with length 10; questions[0]: [batch_size, 1, 20]
    questions = [
        tf.squeeze(input=question, axis=1) for question in questions
    ]  # list with length 10; questions[0]: [batch_size, 20]
    questions = [
        embedding_ops.embedding_lookup(embedding, question)
        for question in questions
    ]  # list with length 10; questions[0]: [batch_size, 20, embedSize]
    question_lengths = tf.split(
        value=self.question_lengths_ph, num_or_size_splits=10, axis=1)
    # Squeeze only the split axis so the result stays a [batch_size] vector
    # even when batch_size == 1 (an unrestricted squeeze would yield a scalar).
    question_lengths = [
        tf.squeeze(question_length, axis=1)
        for question_length in question_lengths
    ]

    if is_training:
        answers = tf.split(
            value=self.answers_ph, num_or_size_splits=10, axis=1)
        answers = [tf.squeeze(input=answer, axis=1) for answer in answers]
        answers = [
            embedding_ops.embedding_lookup(embedding, answer)
            for answer in answers
        ]
        answer_lengths = tf.split(
            value=self.answer_lengths_ph, num_or_size_splits=10, axis=1)
        # Same axis-restricted squeeze as for the question lengths.
        answer_lengths = [
            tf.squeeze(answer_length, axis=1)
            for answer_length in answer_lengths
        ]
        targets = tf.split(
            value=self.targets_ph, num_or_size_splits=10, axis=1)
        targets = [tf.squeeze(input=target, axis=1) for target in targets]

        # Per-round loss mask: position i counts while i <= answer length.
        weights = []
        for r in range(10):
            answer_length = answer_lengths[r]
            weight = [
                tf.greater_equal(x=answer_length, y=i) for i in range(21)
            ]
            weight = tf.cast(
                x=tf.stack(values=weight, axis=1),
                dtype=tf.float32)  # [batch_size, 21]
            weights.append(weight)

    # make RNN cell
    def single_cell():
        return GRUCell(rnnHiddenSize)

    if use_lstm:

        def single_cell():
            return BasicLSTMCell(rnnHiddenSize, state_is_tuple=False)

    make_cell = single_cell
    if rnnLayers > 1:

        def make_cell():
            return MultiRNNCell(
                [single_cell() for _ in range(rnnLayers)],
                state_is_tuple=False)

    encoder_cell = make_cell()
    decoder_cell = OutputProjectionWrapper(
        cell=make_cell(), output_size=vocabSize, activation=None)

    # caption feature
    caption = embedding_ops.embedding_lookup(
        embedding, self.caption_ph)  # [batch_size, 40, embedSize]
    # The placeholder is already a [batch_size] vector; squeezing it would
    # turn it into a scalar when batch_size == 1.
    caption_length = self.caption_length_ph
    with tf.variable_scope('EncoderRNN') as varscope:
        _, captionState = dynamic_rnn(
            cell=encoder_cell,
            inputs=caption,
            sequence_length=caption_length,
            dtype=tf.float32,
            scope=varscope)  # [batch_size, encoder_cell.state_size]

    if is_training:
        losses = []
    else:
        ans_word_probs = []

    for r in range(10):
        # 1. question
        with tf.variable_scope('EncoderRNN', reuse=True) as varscope:
            _, questionState = dynamic_rnn(
                cell=encoder_cell,
                inputs=questions[r],
                sequence_length=question_lengths[r],
                dtype=tf.float32,
                scope=varscope)

        # 2. history
        if r == 0:
            historyState = captionState

        # 3. fusion
        concat = tf.concat(
            values=[self.image_feature_ph, questionState, historyState],
            axis=1)
        if is_training:
            concat = tf.nn.dropout(x=concat, keep_prob=keep_prob)
        with tf.variable_scope('Fusion', reuse=(r > 0)) as varscope:
            encoder_state = tf.contrib.layers.fully_connected(
                inputs=concat,
                num_outputs=decoder_cell.state_size,
                activation_fn=tf.nn.tanh,
                scope=varscope)

        # 4. decoder
        with tf.variable_scope('DecoderRNN', reuse=(r > 0)) as varscope:
            if is_training:
                answer = [
                    tf.squeeze(input=word, axis=1) for word in tf.split(
                        value=answers[r], num_or_size_splits=20, axis=1)
                ]
                decoder_outputs, _ = rnn_decoder(
                    decoder_inputs=[START_EMB] + answer,
                    initial_state=encoder_state,
                    cell=decoder_cell,
                    loop_function=None,
                    scope=varscope)
            else:
                # Filled by loop_function while rnn_decoder unrolls.
                self_answer = []
                self_answer_emb = []

                def loop_function(prev, _):
                    prev_symbol = math_ops.argmax(prev, 1)
                    self_answer.append(
                        tf.cast(x=prev_symbol, dtype=tf.int32))
                    emb_prev = embedding_ops.embedding_lookup(
                        embedding, prev_symbol)
                    self_answer_emb.append(emb_prev)
                    return emb_prev

                decoder_outputs, _ = rnn_decoder(
                    decoder_inputs=[START_EMB] * 21,
                    initial_state=encoder_state,
                    cell=decoder_cell,
                    loop_function=loop_function,
                    scope=varscope)

        # 5. update history
        with tf.variable_scope('EncoderRNN', reuse=True) as varscope:
            _, historyState = dynamic_rnn(
                cell=encoder_cell,
                inputs=questions[r],
                sequence_length=question_lengths[r],
                initial_state=historyState,
                scope=varscope)
            if is_training:
                _, historyState = dynamic_rnn(
                    cell=encoder_cell,
                    inputs=answers[r],
                    sequence_length=answer_lengths[r],
                    initial_state=historyState,
                    scope=varscope)
            else:
                self_answer = tf.stack(
                    values=self_answer + [END], axis=1)  # [batch_size, 21]
                # Index of the first END symbol is used as the answer length.
                self_answer_length = tf.argmax(
                    input=tf.cast(
                        x=tf.equal(x=self_answer, y=end), dtype=tf.float32),
                    axis=1)
                self_answer_emb = tf.stack(
                    values=self_answer_emb,
                    axis=1)  # [batch_size, 20, embSize]
                _, historyState = dynamic_rnn(
                    cell=encoder_cell,
                    inputs=self_answer_emb,
                    sequence_length=self_answer_length,
                    initial_state=historyState,
                    scope=varscope)

        if is_training:
            decoder_outputs = tf.stack(
                values=decoder_outputs,
                axis=1)  # [batch_size, 21, vocabSize]
            loss = tf.contrib.seq2seq.sequence_loss(
                logits=decoder_outputs,
                targets=targets[r],
                weights=weights[r],
                average_across_batch=False)  # [batch_size]
            losses.append(loss)
        else:
            # log_softmax avoids the -inf that log(softmax(x)) produces when
            # a probability underflows to zero.
            decoder_outputs = [
                tf.nn.log_softmax(decoder_output)
                for decoder_output in decoder_outputs
            ]
            ans_word_probs.append(
                tf.stack(values=decoder_outputs,
                         axis=1))  # [batch_size, 21, vocabSize]

    if is_training:
        losses = tf.stack(values=losses, axis=1)  # [batch_size, 10]
        self.loss = tf.reduce_mean(losses)
        params = tf.trainable_variables()
        gradients = tf.gradients(self.loss, params)
        clipped_gradients, _ = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        self.opt_op = tf.train.AdamOptimizer(
            self.learning_rate).apply_gradients(
                zip(clipped_gradients, params),
                global_step=self.global_step)
    else:
        self.ans_word_probs = tf.stack(
            values=ans_word_probs,
            axis=1)  # [batch_size, 10, 21, vocabSize]

    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=99999999)
def _init_weights(self, inputs):
    """Create (or fetch) every weight of the cell.

    Weights listed in `self._shared_weights` are created inside
    `self._shared_scope`; all remaining ones are created in the current
    variable scope. Biases use the bias initializer, except `_B_F`, which
    uses the forget-bias initializer; everything else uses the weight
    initializer. Initializers default to uniform(-0.1, 0.1) (constant 1.0 for
    the forget bias) when the corresponding attribute is None.

    Args:
      inputs: `2-D` tensor with shape `[batch_size x input_size]`, the
        input to the cell. Needed for calculating weight shapes.

    Returns:
      A dict of weight name:tensorflow-weight pairs.
    """
    dtype = inputs.dtype

    bias_initializer = self._bias_initializer
    if bias_initializer is None:
        bias_initializer = init_ops.random_uniform_initializer(
            -0.1, 0.1, dtype=dtype)

    weight_initializer = self._weight_initializer
    if weight_initializer is None:
        weight_initializer = init_ops.random_uniform_initializer(
            -0.1, 0.1, dtype=dtype)

    forget_bias_initializer = self._forget_bias_initializer
    if forget_bias_initializer is None:
        forget_bias_initializer = init_ops.constant_initializer(
            1.0, dtype=dtype)

    def create(name, initializer):
        """Fetch the variable `name` with the shape derived from `inputs`."""
        return vs.get_variable(
            name=name,
            shape=self._get_weight_shape(name, inputs),
            dtype=dtype,
            initializer=initializer)

    def initializer_for_bias(name):
        """The forget bias has its own initializer; other biases share one."""
        if name == _B_F:
            return forget_bias_initializer
        return bias_initializer

    weight_dict = {}

    # initialize shared weights
    with vs.variable_scope(self._shared_scope) as scope:
        for weight_name in self._shared_weights:
            if weight_name == _B_F or weight_name in _BIASES:
                # Biases are created with partitioning switched off.
                with vs.variable_scope(scope) as bias_scope:
                    bias_scope.set_partitioner(None)
                    weight = create(
                        weight_name, initializer_for_bias(weight_name))
            else:
                weight = create(weight_name, weight_initializer)
            weight_dict[weight_name] = weight

    # initialize local weights
    for weight_name in _WEIGHTS | _UEIGHTS | _NEIGHBOUR_UEIGHTS:
        if weight_name in self._shared_weights:
            continue
        weight_dict[weight_name] = create(weight_name, weight_initializer)

    for weight_name in _BIASES:
        if weight_name in self._shared_weights:
            continue
        weight_dict[weight_name] = create(
            weight_name, initializer_for_bias(weight_name))

    return weight_dict
def build(self, inputs_shape):
    """Create the layer's variables once the input depth is known.

    Variables created: `input_kernel`, `input_kernel_top`,
    `hierarchy_kernel1` (matrices, default initializer), `recurrent_kernel`
    and `hierarchy_kernel` (vectors of length num_units, recurrent
    initializer, optionally bounded in absolute value), and `bias` (zeros).
    A BatchNormalization layer is attached when `self._batch_norm` is set.

    Args:
      inputs_shape: shape of the inputs; its second dimension must be
        statically known.

    Raises:
      ValueError: if the second dimension of `inputs_shape` is unknown.
    """
    input_depth = inputs_shape[1].value
    if input_depth is None:
        raise ValueError(
            "Expected inputs.shape[-1] to be known, saw shape: %s"
            % inputs_shape)

    units = self._num_units

    # The three matrices below use the default initializer (an explicit
    # `self._input_initializer` was previously disabled for them).
    self._input_kernel = self.add_variable(
        "input_kernel", shape=[input_depth, units])
    self._input_kernel_top = self.add_variable(
        "input_kernel_top", shape=[units, units])
    self._hierarchy_kernel1 = self.add_variable(
        "hierarchy_kernel1", shape=[units, units])

    if self._recurrent_initializer is None:
        # Initialize the recurrent weights uniformly in [-max_abs, max_abs] or
        # [-1, 1] if max_abs exceeds 1
        max_abs = self._recurrent_max_abs
        bound = max_abs if (max_abs and max_abs < 1.0) else 1.0
        self._recurrent_initializer = init_ops.random_uniform_initializer(
            minval=-bound, maxval=bound)

    self._recurrent_kernel = self.add_variable(
        "recurrent_kernel",
        shape=[units],
        initializer=self._recurrent_initializer)

    # Clip the absolute values of the recurrent weights to the specified
    # minimum
    if self._recurrent_min_abs:
        magnitude = math_ops.maximum(
            math_ops.abs(self._recurrent_kernel), self._recurrent_min_abs)
        self._recurrent_kernel = math_ops.multiply(
            math_ops.sign(self._recurrent_kernel), magnitude)

    # Clip the absolute values of the recurrent weights to the specified
    # maximum
    if self._recurrent_max_abs:
        self._recurrent_kernel = clip_ops.clip_by_value(
            self._recurrent_kernel,
            -self._recurrent_max_abs,
            self._recurrent_max_abs)

    # The hierarchy kernel gets the same initializer and the same bounds.
    self._hierarchy_kernel = self.add_variable(
        "hierarchy_kernel",
        shape=[units],
        initializer=self._recurrent_initializer)

    if self._recurrent_min_abs:
        magnitude = math_ops.maximum(
            math_ops.abs(self._hierarchy_kernel), self._recurrent_min_abs)
        self._hierarchy_kernel = math_ops.multiply(
            math_ops.sign(self._hierarchy_kernel), magnitude)

    if self._recurrent_max_abs:
        self._hierarchy_kernel = clip_ops.clip_by_value(
            self._hierarchy_kernel,
            -self._recurrent_max_abs,
            self._recurrent_max_abs)

    self._bias = self.add_variable(
        "bias",
        shape=[units],
        initializer=init_ops.zeros_initializer(dtype=self.dtype))

    if self._batch_norm:
        self.bn = tf.keras.layers.BatchNormalization(momentum=0.9)

    self.built = True
def testInitializerDifferent(self):
  """Uniform initializers that differ only in seed must not be identical."""
  for dtype in (dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64):
    first, second = [
        init_ops.random_uniform_initializer(0, 7, seed=seed, dtype=dtype)
        for seed in (1, 2)
    ]
    self.assertFalse(identicaltest(self, first, second))
class RandomFourierFeaturesTest(test.TestCase, parameterized.TestCase):
  """Tests for `kernel_layers.RandomFourierFeatures`."""

  def _assert_all_close(self, expected, actual, atol=0.001):
    """Assert closeness; outside eager mode, initialize variables first."""
    if not context.executing_eagerly():
      with self.cached_session() as sess:
        keras_backend._initialize_variables(sess)
        self.assertAllClose(expected, actual, atol=atol)
    else:
      self.assertAllClose(expected, actual, atol=atol)

  @test_util.run_in_graph_and_eager_modes()
  def test_invalid_output_dim(self):
    """Constructing the layer with output_dim=-3 raises ValueError."""
    with self.assertRaisesRegexp(
        ValueError, r'`output_dim` should be a positive integer. Given: -3.'):
      _ = kernel_layers.RandomFourierFeatures(output_dim=-3, scale=2.0)

  @test_util.run_in_graph_and_eager_modes()
  def test_unsupported_kernel_type(self):
    """An unknown kernel-type string raises ValueError."""
    with self.assertRaisesRegexp(
        ValueError, r'Unsupported kernel type: \'unsupported_kernel\'.'):
      _ = kernel_layers.RandomFourierFeatures(3, 'unsupported_kernel', stddev=2.0)

  @test_util.run_in_graph_and_eager_modes()
  def test_invalid_scale(self):
    """Constructing the layer with scale=0.0 raises ValueError."""
    with self.assertRaisesRegexp(
        ValueError,
        r'When provided, `scale` should be a positive float. Given: 0.0.'
    ):
      _ = kernel_layers.RandomFourierFeatures(output_dim=10, scale=0.0)

  @test_util.run_in_graph_and_eager_modes()
  def test_invalid_input_shape(self):
    """Calling the layer on a rank-3 input raises ValueError."""
    inputs = random_ops.random_uniform((3, 2, 4), seed=1)
    rff_layer = kernel_layers.RandomFourierFeatures(output_dim=10, scale=3.0)
    with self.assertRaisesRegexp(
        ValueError,
        r'The rank of the input tensor should be 2. Got 3 instead.'):
      _ = rff_layer(inputs)

  @parameterized.named_parameters(
      ('gaussian', 'gaussian', 10.0, False),
      ('random', init_ops.random_uniform_initializer, 1.0, True))
  @test_util.run_in_graph_and_eager_modes()
  def test_random_features_properties(self, initializer, scale, trainable):
    """Constructor arguments are exposed unchanged as layer attributes."""
    rff_layer = kernel_layers.RandomFourierFeatures(
        output_dim=10,
        kernel_initializer=initializer,
        scale=scale,
        trainable=trainable)
    self.assertEqual(rff_layer.output_dim, 10)
    self.assertEqual(rff_layer.kernel_initializer, initializer)
    self.assertEqual(rff_layer.scale, scale)
    self.assertEqual(rff_layer.trainable, trainable)

  @parameterized.named_parameters(('gaussian', 'gaussian', False),
                                  ('laplacian', 'laplacian', True),
                                  ('other', init_ops.ones_initializer, True))
  @test_util.run_in_graph_and_eager_modes()
  def test_call(self, initializer, trainable):
    """Checks the output shape and the non-trainable variable count."""
    rff_layer = kernel_layers.RandomFourierFeatures(
        output_dim=10,
        kernel_initializer=initializer,
        scale=1.0,
        trainable=trainable,
        name='random_fourier_features')
    inputs = random_ops.random_uniform((3, 2), seed=1)
    outputs = rff_layer(inputs)
    self.assertListEqual([3, 10], outputs.shape.as_list())
    # The test expects 3 variables in total, of which at most one is trainable.
    num_trainable_vars = 1 if trainable else 0
    self.assertLen(rff_layer.non_trainable_variables, 3 - num_trainable_vars)

  @test_util.assert_no_new_pyobjects_executing_eagerly
  def test_no_eager_Leak(self):
    # Tests that repeatedly constructing and building a Layer does not leak
    # Python objects.
    inputs = random_ops.random_uniform((5, 4), seed=1)
    kernel_layers.RandomFourierFeatures(output_dim=4, name='rff')(inputs)
    kernel_layers.RandomFourierFeatures(output_dim=10, scale=2.0)(inputs)

  @test_util.run_in_graph_and_eager_modes()
  def test_output_shape(self):
    """A (3, 2) input with output_dim=7 yields a [3, 7] output."""
    inputs = random_ops.random_uniform((3, 2), seed=1)
    rff_layer = kernel_layers.RandomFourierFeatures(
        output_dim=7, name='random_fourier_features', trainable=True)
    outputs = rff_layer(inputs)
    self.assertEqual([3, 7], outputs.shape.as_list())

  @parameterized.named_parameters(
      ('gaussian', 'gaussian'), ('laplacian', 'laplacian'),
      ('other', init_ops.random_uniform_initializer))
  def test_call_on_placeholder(self, initializer):
    """Placeholders need a defined last dimension; the first may be None."""
    with ops.Graph().as_default():
      # Both dimensions unknown: rejected.
      inputs = array_ops.placeholder(dtype=dtypes.float32, shape=[None, None])
      rff_layer = kernel_layers.RandomFourierFeatures(
          output_dim=5,
          kernel_initializer=initializer,
          name='random_fourier_features')
      with self.assertRaisesRegexp(
          ValueError, r'The last dimension of the inputs to '
          '`RandomFourierFeatures` should be defined. Found `None`.'
      ):
        rff_layer(inputs)

      # Only the last dimension unknown: still rejected.
      inputs = array_ops.placeholder(dtype=dtypes.float32, shape=[2, None])
      rff_layer = kernel_layers.RandomFourierFeatures(
          output_dim=5,
          kernel_initializer=initializer,
          name='random_fourier_features')
      with self.assertRaisesRegexp(
          ValueError, r'The last dimension of the inputs to '
          '`RandomFourierFeatures` should be defined. Found `None`.'
      ):
        rff_layer(inputs)

      # Unknown first dimension with a known last dimension: accepted.
      inputs = array_ops.placeholder(dtype=dtypes.float32, shape=[None, 3])
      rff_layer = kernel_layers.RandomFourierFeatures(
          output_dim=5, name='random_fourier_features')
      rff_layer(inputs)

  @parameterized.named_parameters(
      ('gaussian', 10, 'gaussian', 2.0),
      ('laplacian', 5, 'laplacian', None),
      ('other', 10, init_ops.ones_initializer, 1.0))
  @test_util.run_in_graph_and_eager_modes()
  def test_compute_output_shape(self, output_dim, initializer, scale):
    """`compute_output_shape` rejects bad shapes and maps (b, d) to (b, output_dim)."""
    rff_layer = kernel_layers.RandomFourierFeatures(output_dim,
                                                    initializer,
                                                    scale=scale,
                                                    name='rff')
    # Unknown shape and shapes of rank 0, 1 and 3 are all rejected.
    with self.assertRaises(ValueError):
      rff_layer.compute_output_shape(tensor_shape.TensorShape(None))
    with self.assertRaises(ValueError):
      rff_layer.compute_output_shape(tensor_shape.TensorShape([]))
    with self.assertRaises(ValueError):
      rff_layer.compute_output_shape(tensor_shape.TensorShape([3]))
    with self.assertRaises(ValueError):
      rff_layer.compute_output_shape(tensor_shape.TensorShape([3, 2, 3]))

    with self.assertRaisesRegexp(
        ValueError, r'The innermost dimension of input shape must be defined.'):
      rff_layer.compute_output_shape(tensor_shape.TensorShape([3, None]))

    self.assertEqual([None, output_dim],
                     rff_layer.compute_output_shape((None, 3)).as_list())
    self.assertEqual([None, output_dim],
                     rff_layer.compute_output_shape(
                         tensor_shape.TensorShape([None, 2])).as_list())
    self.assertEqual([4, output_dim],
                     rff_layer.compute_output_shape((4, 1)).as_list())

  @parameterized.named_parameters(
      ('gaussian', 10, 'gaussian', 3.0, False),
      ('laplacian', 5, 'laplacian', 5.5, True),
      ('other', 7, init_ops.random_uniform_initializer(), None, True))
  @test_util.run_in_graph_and_eager_modes()
  def test_get_config(self, output_dim, initializer, scale, trainable):
    """`get_config` returns exactly the expected key/value pairs."""
    rff_layer = kernel_layers.RandomFourierFeatures(
        output_dim,
        initializer,
        scale=scale,
        trainable=trainable,
        name='random_fourier_features',
    )
    # Initializer objects are compared in serialized form; strings as-is.
    expected_initializer = initializer
    if isinstance(initializer, init_ops.Initializer):
      expected_initializer = initializers.serialize(initializer)

    expected_dtype = ('float32'
                      if base_layer_utils.v2_dtype_behavior_enabled() else None)
    expected_config = {
        'output_dim': output_dim,
        'kernel_initializer': expected_initializer,
        'scale': scale,
        'name': 'random_fourier_features',
        'trainable': trainable,
        'dtype': expected_dtype,
    }
    self.assertLen(expected_config, len(rff_layer.get_config()))
    self.assertSameElements(list(expected_config.items()),
                            list(rff_layer.get_config().items()))

  @parameterized.named_parameters(
      ('gaussian', 5, 'gaussian', None, True),
      ('laplacian', 5, 'laplacian', 5.5, False),
      ('other', 7, init_ops.ones_initializer(), 2.0, True))
  @test_util.run_in_graph_and_eager_modes()
  def test_from_config(self, output_dim, initializer, scale, trainable):
    """A layer built with `from_config` has the configured attributes and variables."""
    model_config = {
        'output_dim': output_dim,
        'kernel_initializer': initializer,
        'scale': scale,
        'trainable': trainable,
        'name': 'random_fourier_features',
    }
    rff_layer = kernel_layers.RandomFourierFeatures.from_config(
        model_config)
    self.assertEqual(rff_layer.output_dim, output_dim)
    self.assertEqual(rff_layer.kernel_initializer, initializer)
    self.assertEqual(rff_layer.scale, scale)
    self.assertEqual(rff_layer.trainable, trainable)

    inputs = random_ops.random_uniform((3, 2), seed=1)
    outputs = rff_layer(inputs)
    self.assertListEqual([3, output_dim], outputs.shape.as_list())
    num_trainable_vars = 1 if trainable else 0
    self.assertLen(rff_layer.trainable_variables, num_trainable_vars)
    if trainable:
      self.assertEqual('random_fourier_features/random_features_scale:0',
                       rff_layer.trainable_variables[0].name)
    self.assertLen(rff_layer.non_trainable_variables, 3 - num_trainable_vars)

  @parameterized.named_parameters(
      ('gaussian', 10, 'gaussian', 3.0, True),
      ('laplacian', 5, 'laplacian', 5.5, False),
      ('other', 10, init_ops.random_uniform_initializer(), None, True))
  @test_util.run_in_graph_and_eager_modes()
  def test_same_random_features_params_reused(self, output_dim, initializer,
                                              scale, trainable):
    """Applying the layer on the same input twice gives the same output."""
    rff_layer = kernel_layers.RandomFourierFeatures(
        output_dim=output_dim,
        kernel_initializer=initializer,
        scale=scale,
        trainable=trainable,
        name='random_fourier_features')
    inputs = constant_op.constant(
        np.random.uniform(low=-1.0, high=1.0, size=(2, 4)))
    output1 = rff_layer(inputs)
    output2 = rff_layer(inputs)
    self._assert_all_close(output1, output2)

  @parameterized.named_parameters(
      ('gaussian', 'gaussian', 5.0),
      ('laplacian', 'laplacian', 3.0),
      ('other', init_ops.random_uniform_initializer(), 5.0))
  @test_util.run_in_graph_and_eager_modes()
  def test_different_params_similar_approximation(self, initializer, scale):
    """Layers with output_dim 3000 and 2000 give kernel estimates within 0.08."""
    random_seed.set_random_seed(12345)
    rff_layer1 = kernel_layers.RandomFourierFeatures(
        output_dim=3000,
        kernel_initializer=initializer,
        scale=scale,
        name='rff1')
    rff_layer2 = kernel_layers.RandomFourierFeatures(
        output_dim=2000,
        kernel_initializer=initializer,
        scale=scale,
        name='rff2')
    # Two distinct inputs.
    x = constant_op.constant([[1.0, -1.0, 0.5]])
    y = constant_op.constant([[-1.0, 1.0, 1.0]])

    # Apply both layers to both inputs.
    output_x1 = math.sqrt(2.0 / 3000.0) * rff_layer1(x)
    output_y1 = math.sqrt(2.0 / 3000.0) * rff_layer1(y)
    output_x2 = math.sqrt(2.0 / 2000.0) * rff_layer2(x)
    output_y2 = math.sqrt(2.0 / 2000.0) * rff_layer2(y)

    # Compute the inner products of the outputs (on inputs x and y) for both
    # layers. For any fixed random features layer rff_layer, and inputs x, y,
    # rff_layer(x)^T * rff_layer(y) ~= K(x,y) up to a normalization factor.
    approx_kernel1 = kernelized_utils.inner_product(output_x1, output_y1)
    approx_kernel2 = kernelized_utils.inner_product(output_x2, output_y2)
    self._assert_all_close(approx_kernel1, approx_kernel2, atol=0.08)

  @parameterized.named_parameters(
      ('gaussian', 'gaussian', 5.0, _exact_gaussian(stddev=5.0)),
      ('laplacian', 'laplacian', 20.0, _exact_laplacian(stddev=20.0)))
  @test_util.run_in_graph_and_eager_modes()
  def test_bad_kernel_approximation(self, initializer, scale, exact_kernel_fn):
    """Approximation is bad when output dimension is small."""
    # Two distinct inputs.
    x = constant_op.constant([[1.0, -1.0, 0.5]])
    y = constant_op.constant([[-1.0, 1.0, 1.0]])

    small_output_dim = 10
    random_seed.set_random_seed(1234)
    # Initialize layer.
    rff_layer = kernel_layers.RandomFourierFeatures(
        output_dim=small_output_dim,
        kernel_initializer=initializer,
        scale=scale,
        name='random_fourier_features')

    # Apply layer to both inputs.
    output_x = math.sqrt(2.0 / small_output_dim) * rff_layer(x)
    output_y = math.sqrt(2.0 / small_output_dim) * rff_layer(y)

    # The inner products of the outputs (on inputs x and y) approximates the
    # real value of the RBF kernel but poorly since the output dimension of the
    # layer is small.
    exact_kernel_value = exact_kernel_fn(x, y)
    approx_kernel_value = kernelized_utils.inner_product(
        output_x, output_y)
    abs_error = math_ops.abs(exact_kernel_value - approx_kernel_value)
    # The error must land strictly between 0.05 and 0.5.
    if not context.executing_eagerly():
      with self.cached_session() as sess:
        keras_backend._initialize_variables(sess)
        abs_error_eval = sess.run([abs_error])
        self.assertGreater(abs_error_eval[0][0], 0.05)
        self.assertLess(abs_error_eval[0][0], 0.5)
    else:
      self.assertGreater(abs_error, 0.05)
      self.assertLess(abs_error, 0.5)

  @parameterized.named_parameters(
      ('gaussian', 'gaussian', 5.0, _exact_gaussian(stddev=5.0)),
      ('laplacian', 'laplacian', 10.0, _exact_laplacian(stddev=10.0)))
  @test_util.run_in_graph_and_eager_modes()
  def test_good_kernel_approximation_multiple_inputs(self, initializer, scale,
                                                     exact_kernel_fn):
    """With output_dim=2000 the approximate kernel matrix is within 0.05 of exact."""
    # Parameters.
    input_dim = 5
    output_dim = 2000
    x_rows = 20
    y_rows = 30

    x = constant_op.constant(np.random.uniform(size=(x_rows, input_dim)),
                             dtype=dtypes.float32)
    y = constant_op.constant(np.random.uniform(size=(y_rows, input_dim)),
                             dtype=dtypes.float32)

    random_seed.set_random_seed(1234)
    rff_layer = kernel_layers.RandomFourierFeatures(
        output_dim=output_dim,
        kernel_initializer=initializer,
        scale=scale,
        name='random_fourier_features')

    # The shapes of output_x and output_y are (x_rows, output_dim) and
    # (y_rows, output_dim) respectively.
    output_x = math.sqrt(2.0 / output_dim) * rff_layer(x)
    output_y = math.sqrt(2.0 / output_dim) * rff_layer(y)

    approx_kernel_matrix = kernelized_utils.inner_product(
        output_x, output_y)
    exact_kernel_matrix = exact_kernel_fn(x, y)
    self._assert_all_close(approx_kernel_matrix, exact_kernel_matrix, atol=0.05)
def testWarmStartEmbeddingColumnLinearModel(self):
  """Warm-starts a partitioned embedding column through a vocabulary remap."""
  # Old and new vocabulary files for the "sc_vocab" embedding column.
  old_vocab_file = self._write_vocab(
      ["apple", "banana", "guava", "orange"], "old_vocab")
  new_vocab_file = self._write_vocab(
      ["orange", "guava", "banana", "apple", "raspberry", "blueberry"],
      "new_vocab")

  # Write the checkpoint that the second graph warm-starts from.
  with ops.Graph().as_default() as ckpt_graph, self.test_session(
      graph=ckpt_graph) as ckpt_sess:
    variable_scope.get_variable(
        "linear_model/sc_vocab_embedding/embedding_weights",
        initializer=[[0.5, 0.4], [1., 1.1], [2., 2.2], [3., 3.3]])
    variable_scope.get_variable(
        "linear_model/sc_vocab_embedding/weights",
        initializer=[[0.69], [0.71]])
    self._write_checkpoint(ckpt_sess)

  def _split_first_dim(shape, dtype):  # pylint:disable=unused-argument
    # At most two slices along the first dimension, one along every other.
    slices = [1] * len(shape)
    slices[0] = min(2, shape[0].value)
    return slices

  # Feature columns over the new vocabulary.
  sc_vocab = fc.categorical_column_with_vocabulary_file(
      "sc_vocab", vocabulary_file=new_vocab_file, vocabulary_size=6)
  emb_vocab = fc.embedding_column(categorical_column=sc_vocab, dimension=2)
  feature_columns = [emb_vocab]

  # Fresh graph and session in which the warm-start is applied.
  with ops.Graph().as_default() as ws_graph, self.test_session(
      graph=ws_graph) as sess:
    cols_to_vars = {}
    with variable_scope.variable_scope("", partitioner=_split_first_dim):
      # Builds the (partitioned) model variables.
      fc.linear_model(
          features=self._create_dummy_inputs(),
          feature_columns=feature_columns,
          cols_to_vars=cols_to_vars)

    # Vocabulary remapping info for the embedding weights.
    embedding_vocab_info = ws_util._VocabInfo(
        new_vocab=sc_vocab.vocabulary_file,
        new_vocab_size=sc_vocab.vocabulary_size,
        num_oov_buckets=sc_vocab.num_oov_buckets,
        old_vocab=old_vocab_file,
        # Can't use constant_initializer with load_and_remap. In practice,
        # use a truncated normal initializer.
        backup_initializer=init_ops.random_uniform_initializer(
            minval=0.42, maxval=0.42))
    settings = ws_util._WarmStartSettings(
        self.get_temp_dir(),
        vars_to_warmstart=".*sc_vocab.*",
        var_name_to_vocab_info={
            "linear_model/sc_vocab_embedding/embedding_weights":
                embedding_vocab_info
        })
    ws_util._warmstart(settings)
    sess.run(variables.global_variables_initializer())

    # Rows for words present in the old vocabulary follow the new vocabulary
    # order; the two new words are expected to hold 0.42, the only value the
    # backup initializer above can produce.
    expected = {
        emb_vocab: [
            # embedding_weights part 0.
            np.array([[3., 3.3], [2., 2.2], [1., 1.1]]),
            # embedding_weights part 1.
            np.array([[0.5, 0.4], [0.42, 0.42], [0.42, 0.42]]),
            # linear weights part 0.
            np.array([[0.69]]),
            # linear weights part 1.
            np.array([[0.71]])
        ]
    }
    self._assert_cols_to_vars(cols_to_vars, expected, sess)
def testDerivativeOfBlockGRUToGRUCellMultiSteps(self):
  """GRUBlockCell and GRUCell must give matching gradients w.r.t. x and h."""
  batch_size = 2
  cell_size = 3
  input_size = 4
  time_steps = 2
  with self.test_session(use_gpu=self._use_gpu, graph=ops.Graph()) as sess:
    # One seed drives both the variable initializer and the numpy inputs.
    seed = 1994
    initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=seed)
    np.random.seed(seed)

    # Graph inputs; `h` is a zeros tensor whose value is overridden by feed.
    concat_x = array_ops.placeholder(
        dtypes.float32, shape=(time_steps, batch_size, input_size))
    h = array_ops.zeros([batch_size, cell_size])

    # Concrete values fed for the inputs.
    x_values = np.random.rand(time_steps, batch_size, input_size)
    h_value = np.random.rand(batch_size, cell_size)
    feeds = {concat_x: x_values, h: h_value}

    def _first_step_grads(scope_name, cell_factory):
      """Build an RNN under `scope_name`; return grads of its first output."""
      with vs.variable_scope(scope_name, initializer=initializer):
        rnn_outputs, _ = rnn.dynamic_rnn(
            cell_factory(cell_size),
            inputs=concat_x,
            initial_state=h,
            time_major=True,
            dtype=dtypes.float32)
        grad_wrt_x = gradients_impl.gradients([rnn_outputs[0]], concat_x)
        grad_wrt_h = gradients_impl.gradients([rnn_outputs[0]], h)
        sess.run([variables.global_variables_initializer()])
        return sess.run([grad_wrt_x, grad_wrt_h], feeds)

    # Block implementation first, then the basic one.
    block_dx, block_dh = _first_step_grads("block", gru_ops.GRUBlockCell)
    basic_dx, basic_dh = _first_step_grads("basic", core_rnn_cell_impl.GRUCell)

    # Gradients of the output with respect to x.
    self.assertEqual(len(block_dx), len(basic_dx))
    for block, basic in zip(block_dx, basic_dx):
      self.assertAllClose(block, basic)

    # Gradients of the output with respect to h.
    self.assertEqual(len(block_dh), len(basic_dh))
    for block, basic in zip(block_dh, basic_dh):
      self.assertAllClose(block, basic)
def call(self, inputs, state):
  """Run one step of the phased LSTM.

  Args:
    inputs: input Tensor, 2D. The last column is sliced off and used as the
      timestamp of the step; the remaining columns are the features fed to
      the LSTM gates.
    state: if `state_is_tuple` is False, this must be a state Tensor,
      `2-D, batch x state_size`.  If `state_is_tuple` is True, this must be a
      tuple of state Tensors, both `2-D`, with column sizes `c_state` and
      `m_state`.

  Returns:
    A tuple containing:

    - A `2-D, [batch x output_dim]`, Tensor representing the output of the
      LSTM after reading `inputs` when previous state was `state`.
      Here output_dim is:
         num_proj if num_proj was set,
         num_units otherwise.
    - Tensor(s) representing the new state of LSTM after reading `inputs` when
      the previous state was `state`.  Same type and shape(s) as `state`.

  Raises:
    ValueError: If input size cannot be inferred from inputs via
      static shape inference.
  """
  num_proj = self._num_units if self._num_proj is None else self._num_proj

  # Unpack the previous cell state (c) and output (m).
  if self._state_is_tuple:
    (c_prev, m_prev) = state
  else:
    c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
    m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])

  dtype = inputs.dtype
  input_size = inputs.get_shape().with_rank(2)[1]
  if input_size.value is None:
    raise ValueError("Could not infer input size from inputs.get_shape()[-1]")

  # --------------------------------------- #
  # ------------- PHASED LSTM ------------- #
  # ---------------- BEGIN ---------------- #
  # --------------------------------------- #
  i_size = input_size.value - 1  # -1 to extract time
  # Last column -> timestamps; everything before it -> features.
  times = array_ops.slice(inputs, [0, i_size], [-1, 1])
  filtered_inputs = array_ops.slice(inputs, [0, 0], [-1, i_size])

  # Per-unit time-gate parameters. With `manual_set`, T is a constant
  # `tau_init` and S is a constant 0; otherwise both are drawn randomly.
  tau = vs.get_variable(
      "T", shape=[self._num_units],
      initializer=random_exp_initializer(0, self.tau_init) if not self.manual_set else init_ops.constant_initializer(self.tau_init),
      trainable=self.trainable, dtype=dtype)

  r_on = vs.get_variable(
      "R", shape=[self._num_units],
      initializer=init_ops.constant_initializer(self.r_on_init),
      trainable=self.trainable, dtype=dtype)

  # S is drawn uniformly between 0 and the initial value of T.
  s = vs.get_variable(
      "S", shape=[self._num_units],
      initializer=init_ops.random_uniform_initializer(0., tau.initialized_value()) if not self.manual_set else init_ops.constant_initializer(0.),
      trainable=self.trainable, dtype=dtype)

  # Add a leading axis of size 1 so the parameters broadcast against `times`.
  tau_broadcast = tf.expand_dims(tau, axis=0)
  r_on_broadcast = tf.expand_dims(r_on, axis=0)
  s_broadcast = tf.expand_dims(s, axis=0)

  # Only the magnitudes of R and T are used below.
  r_on_broadcast = tf.abs(r_on_broadcast)
  tau_broadcast = tf.abs(tau_broadcast)
  # Repeat the timestamp column once per unit.
  times = tf.tile(times, [1, self._num_units])

  # calculate kronos gate
  # phi = ((((times - s) mod tau) + tau) mod tau) / tau
  phi = tf.div(tf.mod(tf.mod(times - s_broadcast, tau_broadcast) + tau_broadcast, tau_broadcast), tau_broadcast)
  is_up = tf.less(phi, (r_on_broadcast * 0.5))
  is_down = tf.logical_and(tf.less(phi, r_on_broadcast), tf.logical_not(is_up))

  # Piecewise gate value:
  #   phi < r_on / 2          -> phi / (r_on / 2)
  #   r_on / 2 <= phi < r_on  -> 2 - 2 * phi / r_on
  #   otherwise               -> alpha * phi
  k = tf.where(is_up, phi / (r_on_broadcast * 0.5),
               tf.where(is_down, 2. - 2. * (phi / r_on_broadcast), self.alpha * phi))

  # Gate pre-activations from the features (timestamp column excluded) and
  # the previous output.
  lstm_matrix = math_ops.matmul(array_ops.concat([filtered_inputs, m_prev], 1), self._kernel)
  lstm_matrix = nn_ops.bias_add(lstm_matrix, self._bias)
  # --------------------------------------- #
  # ------------- PHASED LSTM ------------- #
  # ----------------- END ----------------- #
  # --------------------------------------- #

  # The four equal column groups are consumed below as: i with the sigmoid
  # that scales the candidate j, f with the sigmoid that scales c_prev, and
  # o with the sigmoid that scales the activated cell state.
  i, j, f, o = array_ops.split(value=lstm_matrix, num_or_size_splits=4, axis=1)

  if self._use_peepholes:
    c = (sigmoid(f + self._forget_bias + self._w_f_diag * c_prev) * c_prev +
         sigmoid(i + self._w_i_diag * c_prev) * self._activation(j))
  else:
    c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) *
         self._activation(j))

  if self._cell_clip is not None:
    # pylint: disable=invalid-unary-operand-type
    c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
    # pylint: enable=invalid-unary-operand-type
  if self._use_peepholes:
    m = sigmoid(o + self._w_o_diag * c) * self._activation(c)
  else:
    m = sigmoid(o) * self._activation(c)

  if self._num_proj is not None:
    m = math_ops.matmul(m, self._proj_kernel)

    if self._proj_clip is not None:
      # pylint: disable=invalid-unary-operand-type
      m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
      # pylint: enable=invalid-unary-operand-type

  # APPLY KRONOS GATE
  # Blend the freshly computed values with the previous ones, weighted by k.
  c = k * c + (1. - k) * c_prev
  m = k * m + (1. - k) * m_prev
  # END KRONOS GATE

  new_state = (LSTMStateTuple(c, m) if self._state_is_tuple else
               array_ops.concat([c, m], 1))
  return m, new_state
def embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell,
                                num_encoder_symbols, num_decoder_symbols,
                                batch_size, state_size,
                                decoder_inputs_positions=None,
                                decoder_inputs_maps=None,
                                feed_previous=False,
                                dtype=dtypes.float32, scope=None):
  """Embedding sequence-to-sequence model with attention.

  This model first embeds encoder_inputs by a newly created embedding (of shape
  [num_encoder_symbols x cell.input_size]). Then it runs an RNN to encode
  embedded encoder_inputs into a state vector. It keeps the outputs of this
  RNN at every step to use for attention later. Next, it embeds decoder_inputs
  by another newly created embedding (of shape [num_decoder_symbols x
  cell.input_size]). Then it runs attention decoder, initialized with the last
  encoder state, on embedded decoder_inputs and attending to encoder outputs.

  Args:
    encoder_inputs: a list of 1D int32 Tensors of shape [batch_size].
    decoder_inputs: a list of 1D int32 Tensors of shape [batch_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    num_encoder_symbols: integer; number of symbols on the encoder side.
    num_decoder_symbols: integer; number of symbols on the decoder side. Also
      used as the decoder's `output_size`.
    batch_size: forwarded unchanged to `embedding_attention_decoder`
      (original note: "need to clarify for decoding").
    state_size: forwarded unchanged to `embedding_attention_decoder`.
    decoder_inputs_positions: a list of 2D Tensors of shape [batch_size, 3].
    decoder_inputs_maps: a 1D Tensor of length batch_size.
    feed_previous: Python bool; if True, only the first of decoder_inputs will
      be used (the "GO" symbol), and all other decoder inputs will be taken
      from previous outputs (as in embedding_rnn_decoder). If False,
      decoder_inputs are used as given (the standard decoder case). A boolean
      Tensor is NOT supported and raises ValueError.
    dtype: The dtype of the initial RNN state (default: tf.float32).
    scope: VariableScope for the created subgraph; defaults to
      "embedding_attention_seq2seq".

  Returns:
    Whatever `embedding_attention_decoder` returns. According to the original
    documentation of this function that is:
    outputs: A list of the same length as decoder_inputs of 2D Tensors with
      shape [batch_size x num_decoder_symbols] containing the generated outputs.
    states: The state of each decoder cell in each time-step. This is a list
      with length len(decoder_inputs) -- one item for each time-step.
      Each item is a 2D Tensor of shape [batch_size x cell.state_size].
    attentions: a list of 2D Tensors of shape [batch_size, cell.state_size].
    environments: a list of 2D Tensors of shape [batch_size, state_size].

  Raises:
    ValueError: if `feed_previous` is not a Python bool.
  """
  # Reject the unsupported case up front, before any encoder ops or variables
  # are added to the graph. Supporting a Tensor here would require building
  # two decoder graphs and selecting between them with cond.
  if not isinstance(feed_previous, bool):
    raise ValueError("Incompatible variable feed_previous.\n")

  with vs.variable_scope(scope or "embedding_attention_seq2seq"):
    # Encoder.
    encoder_cell = rnn_cell.EmbeddingWrapper(
        cell, num_encoder_symbols,
        initializer=init_ops.random_uniform_initializer(-0.08, 0.08))
    encoder_outputs, encoder_states = rnn.rnn(
        encoder_cell, encoder_inputs, dtype=dtype)

    # First calculate a concatenation of encoder outputs to put attention on.
    top_states = [
        array_ops.reshape(e, [-1, 1, cell.output_size])
        for e in encoder_outputs
    ]
    # NOTE(review): `concat(1, top_states)` passes the axis first; confirm
    # this matches the argument order of the TensorFlow version in use.
    attention_states = array_ops.concat(1, top_states)

    # Decoder, initialized from the last encoder state.
    return embedding_attention_decoder(
        decoder_inputs, encoder_states[-1], attention_states, cell,
        num_decoder_symbols, batch_size, state_size,
        decoder_inputs_positions=decoder_inputs_positions,
        decoder_inputs_maps=decoder_inputs_maps,
        output_size=num_decoder_symbols,
        feed_previous=feed_previous)
def __call__(
    self,
    inputs,  # carries both the x input and the t (time) input
    state,  # carries the cell state and the hidden state
    scope=None):
  """Phased long short-term memory cell (P-LSTM).

  Args:
    inputs: 2-D tensor. Column 0 is read as the feature `x`; column 1 is read
      as the timestamp, of which only the last row's value is used.
    state: either a `(c, h)` tuple, or a single tensor that is split into two
      equal halves `(c, h)` along axis 1.
    scope: optional variable scope; defaults to the class name.

  Returns:
    A pair `(new_h, (new_c, new_h))`.
  """
  with vs.variable_scope(scope or type(self).__name__):
    # Parameters of gates are concatenated into one multiply for efficiency.
    # `isinstance` is required here: `state is tuple` compares against the
    # `tuple` type object itself and is never true for an actual (c, h) tuple.
    if isinstance(state, tuple):
      # A tuple state can be unpacked directly.
      c_prev, h_prev = state
    else:
      # Otherwise the state is one tensor; split it along the second axis.
      c_prev, h_prev = array_ops.split(
          value=state, num_or_size_splits=2, axis=1)

    # (2, batch_size, seq_len)
    # NB: here we explicitly give t as input.
    # Column 0 of the input is x; reshape it to a column vector.
    x = tf.reshape(inputs[:, 0], (-1, 1))
    # Column 1 holds the timestamps; take the one from the last batch row.
    # Now we only accept one id. We have a batch so it's a bit more complex.
    # maybe the information should come from the outside. To be defined later.
    t = inputs[:, 1][-1]

    # Plain linear map, no activation yet. Only the linear combination is
    # computed here because the peephole terms may still be added to it below.
    concat = _linear([x, h_prev], 4 * self._num_units, True)

    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    i, j, f, o = array_ops.split(value=concat, num_or_size_splits=4, axis=1)

    dtype = inputs.dtype
    # Note: a mask would be per time step, whereas tau, r_on, s and the
    # resulting gate value are per hidden / cell-state unit. Each unit gets
    # its own tau (period), r_on (open ratio) and s (phase shift).
    tau = vs.get_variable(
        'tau', shape=[self._num_units],
        initializer=random_exp_initializer(0, self.tau_init), dtype=dtype)

    r_on = vs.get_variable(
        'r_on', shape=[self._num_units],
        initializer=init_ops.constant_initializer(self.r_on_init),
        dtype=dtype)

    s = vs.get_variable(
        's', shape=[self._num_units],
        initializer=init_ops.random_uniform_initializer(
            0., tau.initialized_value()),
        dtype=dtype)

    # tf.tile repeats the timestamp once per unit.
    times = tf.tile(tf.reshape(t, [-1, 1]), [1, self._num_units])
    phase = phi(times, s, tau)
    # Element-wise calculation.
    kappa = time_gate_fast(
        phase, r_on, self._leak_rate, self._training_phase)

    w_o_peephole = None
    # With peephole connections, add the cell-state terms to the forget and
    # input gate pre-activations.
    if self._use_peepholes:
      w_i_peephole = vs.get_variable(
          'W_I_peephole', shape=[self._num_units], dtype=dtype)
      w_f_peephole = vs.get_variable(
          'W_F_peephole', shape=[self._num_units], dtype=dtype)
      w_o_peephole = vs.get_variable(
          'W_O_peephole', shape=[self._num_units], dtype=dtype)
      f += w_f_peephole * c_prev
      i += w_i_peephole * c_prev

    new_c_tilde = sigmoid(f) * c_prev + sigmoid(i) * self._activation(j)
    if self._use_peepholes:
      o += w_o_peephole * new_c_tilde

    new_h_tilde = sigmoid(o) * self._activation(new_c_tilde)

    # Why the output peephole uses c_tilde rather than c (note kept from the
    # original source, quoting correspondence signed "Danny"):
    #
    #   Hi all,
    #   Yes, Philippe, you are correct in that Equation 4 should reference
    #   c_tilde and not c. I can add a point to the paper to mention that, and
    #   will update Figure 1 so the line is correctly drawn to c_tilde
    #   instead. The intuition here is that the gates should be blind to the
    #   effect of the khronos gate; input, forget and output gate should all
    #   operate as if the cell were a normal LSTM cell, while the khronos gate
    #   allows it to either operate or not operate (and then linearly
    #   interpolates between these two states). If the output gate is
    #   influenced by the khronos gate (if the peepholes reference c instead
    #   of c_tilde), then the PLSTM would no longer be a gated LSTM cell, but
    #   somehow be self-dependent on the time gate's actual operation.
    #   I think everyone's right in that it wouldn't influence much -- but it
    #   should be updated in the paper.
    #   Thanks very much for pointing out the issue, Philippe!
    #   -Danny

    # Apply Khronos gate
    new_h = kappa * new_h_tilde + (1 - kappa) * h_prev
    new_c = kappa * new_c_tilde + (1 - kappa) * c_prev
    # The gate value decides how much of the state is actually updated.
    new_state = (new_c, new_h)
    return new_h, new_state
def testTimeReversedFusedRNN(self):
  """Fused fw + time-reversed bw cells must match static_bidirectional_rnn."""
  with self.test_session() as sess:
    initializer = init_ops.random_uniform_initializer(
        -0.01, 0.01, seed=19890213)
    fw_cell = core_rnn_cell_impl.BasicRNNCell(10)
    bw_cell = core_rnn_cell_impl.BasicRNNCell(10)
    batch_size = 5
    input_size = 20
    timelen = 15
    inputs = constant_op.constant(
        np.random.randn(timelen, batch_size, input_size))

    def _evaluate(scope_prefix, out, fw, bw):
      """Initialize variables, then fetch values and gradients of `out`."""
      scoped_vars = [
          var for var in variables.trainable_variables()
          if var.name.startswith(scope_prefix)
      ]
      sess.run([variables.global_variables_initializer()])
      values = sess.run([out, fw, bw])
      input_grads = sess.run(gradients_impl.gradients(out, inputs))
      weight_grads = sess.run(gradients_impl.gradients(out, scoped_vars))
      return values, input_grads, weight_grads

    # Reference: static bi-directional RNN over the unstacked inputs.
    with variable_scope.variable_scope("basic", initializer=initializer):
      step_inputs = array_ops.unstack(inputs)
      step_outputs, ref_fw_state, ref_bw_state = (
          core_rnn.static_bidirectional_rnn(
              fw_cell, bw_cell, step_inputs, dtype=dtypes.float64))
      stacked_outputs = array_ops.stack(step_outputs)
      basic_values, basic_grads, basic_wgrads = _evaluate(
          "basic/", stacked_outputs, ref_fw_state, ref_bw_state)

    # Under test: a fused forward cell plus a time-reversed fused backward
    # cell, with their outputs concatenated along axis 2.
    with variable_scope.variable_scope("fused", initializer=initializer):
      fused_cell = fused_rnn_cell.FusedRNNCellAdaptor(
          core_rnn_cell_impl.BasicRNNCell(10))
      fused_bw_cell = fused_rnn_cell.TimeReversedFusedRNN(
          fused_rnn_cell.FusedRNNCellAdaptor(
              core_rnn_cell_impl.BasicRNNCell(10)))
      fw_outputs, fused_fw = fused_cell(
          inputs, dtype=dtypes.float64, scope="fw")
      bw_outputs, fused_bw = fused_bw_cell(
          inputs, dtype=dtypes.float64, scope="bw")
      joined_outputs = array_ops.concat([fw_outputs, bw_outputs], 2)
      fused_values, fused_grads, fused_wgrads = _evaluate(
          "fused/", joined_outputs, fused_fw, fused_bw)

    # Outputs, forward state and backward state, in that order.
    for basic, fused in zip(basic_values, fused_values):
      self.assertAllClose(basic, fused)
    self.assertAllClose(basic_grads, fused_grads)
    # Weight gradients are compared with a looser tolerance.
    for basic, fused in zip(basic_wgrads, fused_wgrads):
      self.assertAllClose(basic, fused, rtol=1e-2, atol=1e-2)
def adaptive_softmax_loss(inputs, labels, cutoff, project_factor=4,
                          initializer=None, name=None):
    """Computes and returns the adaptive softmax loss.

    Adaptive softmax is an improvement of hierarchical softmax. See
    [Efficient softmax approximation for
    GPUs](https://arxiv.org/pdf/1609.04309v2.pdf).

    This is a faster way to train a softmax classifier over a huge number of
    classes, and can be used for **both training and prediction**. For
    example, it can be used for training a language model with a very large
    vocabulary, and the trained language model can be used in speech
    recognition, text generation, and machine translation very efficiently.

    Args:
      inputs: A `Tensor` of shape `[batch_size, dim]`. The forward
        activations of the input network. Both dimensions must be statically
        known because they are converted with `int(...)`.
      labels: `Tensor` of shape `[d_0, d_1, ..., d_{r-2}]` and dtype `int32`
        or `int64`. Each entry in `labels` must be an index in
        `[0, num_classes)`.
      cutoff: A list indicating the limits of the different clusters.
        `cutoff[0]` is the size of the head; cluster `i` covers labels in
        `[cutoff[i], cutoff[i + 1])`.
      project_factor: A floating point value greater or equal to 1.0. The
        projection factor between two neighboring clusters.
      initializer: Initializer for adaptive softmax variables (optional).
        Defaults to a uniform initializer scaled by `1 / sqrt(dim)`.
      name: A name for the operation (optional).

    Returns:
      loss: A `batch_size` 1-D tensor of the adaptive softmax cross entropy
        loss.
      training_losses: A list of 1-D tensors of adaptive softmax loss for
        each cluster, which can be used for calculating the gradients and
        back propagation when training. The tail losses come first, followed
        by the head loss.
    """
    input_dim = int(inputs.get_shape()[1])
    sample_num = int(inputs.get_shape()[0])
    cluster_num = len(cutoff) - 1
    with ops.name_scope(name, "AdaptiveSoftmax"):
        if initializer is None:
            stdv = math.sqrt(1. / input_dim)
            initializer = init_ops.random_uniform_initializer(
                -stdv * 0.8, stdv * 0.8)

        # The head predicts the cutoff[0] head classes plus one slot per tail
        # cluster.
        head_dim = cutoff[0] + cluster_num
        head_w = variable_scope.get_variable(
            "adaptive_softmax_head_w", [input_dim, head_dim],
            initializer=initializer)

        tail_project_factor = project_factor
        tail_w = []
        for i in range(cluster_num):
            # int(...) keeps the variable shape integral when project_factor
            # is a float, as the docstring allows.
            project_dim = max(1, int(input_dim // tail_project_factor))
            tail_dim = cutoff[i + 1] - cutoff[i]
            tail_w.append([
                variable_scope.get_variable(
                    "adaptive_softmax_tail{}_proj_w".format(i + 1),
                    [input_dim, project_dim], initializer=initializer),
                variable_scope.get_variable(
                    "adaptive_softmax_tail{}_w".format(i + 1),
                    [project_dim, tail_dim], initializer=initializer)
            ])
            tail_project_factor *= project_factor

        # Get tail masks and update head labels
        training_losses = []
        loss = array_ops.zeros([sample_num], dtype=dtypes.float32)
        head_labels = labels
        for i in range(cluster_num):
            mask = math_ops.logical_and(
                math_ops.greater_equal(labels, cutoff[i]),
                math_ops.less(labels, cutoff[i + 1]))

            # Update head labels: samples of tail cluster i are relabelled to
            # the head slot cutoff[0] + i.
            head_labels = array_ops.where(
                mask,
                array_ops.constant([cutoff[0] + i] * sample_num),
                head_labels)

            # Compute tail loss
            tail_inputs = array_ops.boolean_mask(inputs, mask)
            tail_logits = math_ops.matmul(
                math_ops.matmul(tail_inputs, tail_w[i][0]), tail_w[i][1])
            tail_labels = array_ops.boolean_mask(labels - cutoff[i], mask)
            tail_loss = nn.sparse_softmax_cross_entropy_with_logits(
                labels=tail_labels, logits=tail_logits)
            training_losses.append(tail_loss)
            # Scatter the per-sample tail loss back to its batch position.
            # NOTE(review): the axis-less squeeze is kept exactly as written;
            # confirm the SparseTensor index rank before changing it.
            aligned_tail_loss = sparse_tensor.SparseTensor(
                array_ops.squeeze(array_ops.where(mask)), tail_loss,
                [sample_num])
            loss += sparse_ops.sparse_tensor_to_dense(aligned_tail_loss)

        # Compute head loss
        head_logits = math_ops.matmul(inputs, head_w)
        head_loss = nn.sparse_softmax_cross_entropy_with_logits(
            logits=head_logits, labels=head_labels)
        loss += head_loss
        training_losses.append(head_loss)

        return loss, training_losses
def testLSTMBasicToBlockCellPeeping(self):
    """Check that peephole LSTMCell and LSTMBlockCell stacks agree.

    A two-layer `MultiRNNCell` is built once from `rnn_cell.LSTMCell` with
    peepholes (scope "basic") and once from `lstm_ops.LSTMBlockCell` with
    peepholes (scope "block"), using the same seeded initializer. Both are
    fed the same input and state values, and every fetched tensor must match.
    """
    with self.test_session(use_gpu=True) as sess:
        x = array_ops.zeros([1, 2])
        x_values = np.random.randn(1, 2)

        # Values fed for the states m0..m3, in that order.
        state_values = [
            factor * np.ones([1, 2]) for factor in (0.1, -0.1, -0.2, 0.2)
        ]
        initializer = init_ops.random_uniform_initializer(
            -0.01, 0.01, seed=19890212)

        def run_stack(scope_name, make_layer):
            # Build a two-layer stack under `scope_name` and evaluate it.
            with variable_scope.variable_scope(
                    scope_name, initializer=initializer):
                m0 = array_ops.zeros([1, 2])
                m1 = array_ops.zeros([1, 2])
                m2 = array_ops.zeros([1, 2])
                m3 = array_ops.zeros([1, 2])
                layers = [make_layer() for _ in range(2)]
                stack = rnn_cell.MultiRNNCell(layers, state_is_tuple=True)
                g, ((out_m0, out_m1), (out_m2, out_m3)) = stack(
                    x, ((m0, m1), (m2, m3)))
                sess.run([variables.global_variables_initializer()])
                feed = {x.name: x_values}
                for state, value in zip((m0, m1, m2, m3), state_values):
                    feed[state.name] = value
                return sess.run([g, out_m0, out_m1, out_m2, out_m3], feed)

        basic_res = run_stack(
            "basic",
            lambda: rnn_cell.LSTMCell(
                2, use_peepholes=True, state_is_tuple=True))
        block_res = run_stack(
            "block",
            lambda: lstm_ops.LSTMBlockCell(2, use_peephole=True))

        self.assertEqual(len(basic_res), len(block_res))
        for expected, actual in zip(basic_res, block_res):
            self.assertAllClose(expected, actual)
def _eunn_param(hidden_size, capacity=2, fft=False, comp=True):
    """Create the EUNN parameters and precompute per-layer coefficients.

    Creates the rotation-angle variables ("theta_*", plus "phi_*" and "omega"
    when `comp` is true) and turns them into one row of cosine ("diagonal")
    and one row of sine ("off-diagonal") coefficients per layer.

    Args:
      hidden_size: Number of hidden units; the width of every coefficient row.
      capacity: Number of layers. Ignored when `fft` is true, where it is
        replaced by `ceil(log2(hidden_size))`.
      fft: If true, build the FFT-style layout; otherwise the layout that
        alternates two kinds of layers.
      comp: If true, build complex-valued coefficients and the extra "omega"
        phase vector; otherwise real-valued coefficients.

    Returns:
      A tuple `(diag_vec, off_vec, diag, capacity)`: two `TensorArray`s
      holding one coefficient row per layer, the complex `omega` phase vector
      (`None` when `comp` is false), and the capacity actually used.
    """
    theta_phi_initializer = init_ops.random_uniform_initializer(-np.pi, np.pi)
    if fft:
        capacity = int(np.ceil(np.log2(hidden_size)))

        diag_list_0 = []
        off_list_0 = []
        # First pass: count how many angles all layers need together.
        varsize = 0
        for i in range(capacity):
            size = capacity - i
            normal_size = (hidden_size // (2**size)) * (2**(size - 1))
            extra_size = max(0, (hidden_size % (2**size)) - (2**(size - 1)))
            varsize += normal_size + extra_size

        params_theta = vs.get_variable(
            "theta_0", [varsize], initializer=theta_phi_initializer)
        cos_theta = math_ops.cos(params_theta)
        sin_theta = math_ops.sin(params_theta)

        if comp:
            params_phi = vs.get_variable(
                "phi_0", [varsize], initializer=theta_phi_initializer)
            cos_phi = math_ops.cos(params_phi)
            sin_phi = math_ops.sin(params_phi)

            cos_list_0 = math_ops.complex(
                cos_theta, array_ops.zeros_like(cos_theta))
            cos_list_1 = math_ops.complex(
                math_ops.multiply(cos_theta, cos_phi),
                math_ops.multiply(cos_theta, sin_phi))
            sin_list_0 = math_ops.complex(
                sin_theta, array_ops.zeros_like(sin_theta))
            sin_list_1 = math_ops.complex(
                -math_ops.multiply(sin_theta, cos_phi),
                -math_ops.multiply(sin_theta, sin_phi))

        # Second pass: slice each layer's angles out of the flat vectors.
        last = 0
        for i in range(capacity):
            size = capacity - i
            normal_size = (hidden_size // (2**size)) * (2**(size - 1))
            extra_size = max(0, (hidden_size % (2**size)) - (2**(size - 1)))
            # Units left unpaired in this layer: cos 1 and sin 0 below.
            pad_size = hidden_size - 2 * normal_size - 2 * extra_size

            if comp:
                cos_list_normal = array_ops.concat([
                    array_ops.slice(cos_list_0, [last], [normal_size]),
                    array_ops.slice(cos_list_1, [last], [normal_size])
                ], 0)
                sin_list_normal = array_ops.concat([
                    array_ops.slice(sin_list_0, [last], [normal_size]),
                    -array_ops.slice(sin_list_1, [last], [normal_size])
                ], 0)
                last += normal_size

                cos_list_extra = array_ops.concat([
                    array_ops.slice(cos_list_0, [last], [extra_size]),
                    math_ops.complex(
                        tf.ones([pad_size]), tf.zeros([pad_size])),
                    array_ops.slice(cos_list_1, [last], [extra_size])
                ], 0)
                sin_list_extra = array_ops.concat([
                    array_ops.slice(sin_list_0, [last], [extra_size]),
                    math_ops.complex(
                        tf.zeros([pad_size]), tf.zeros([pad_size])),
                    -array_ops.slice(sin_list_1, [last], [extra_size])
                ], 0)
                last += extra_size

            else:
                cos_list_normal = array_ops.slice(
                    cos_theta, [last], [normal_size])
                cos_list_normal = array_ops.concat(
                    [cos_list_normal, cos_list_normal], 0)
                cos_list_extra = array_ops.slice(
                    cos_theta, [last + normal_size], [extra_size])
                cos_list_extra = array_ops.concat([
                    cos_list_extra,
                    tf.ones([pad_size]),
                    cos_list_extra
                ], 0)

                sin_list_normal = array_ops.slice(
                    sin_theta, [last], [normal_size])
                sin_list_normal = array_ops.concat(
                    [sin_list_normal, -sin_list_normal], 0)
                sin_list_extra = array_ops.slice(
                    sin_theta, [last + normal_size], [extra_size])
                sin_list_extra = array_ops.concat([
                    sin_list_extra,
                    tf.zeros([pad_size]),
                    -sin_list_extra
                ], 0)

                last += normal_size + extra_size

            if normal_size != 0:
                cos_list_normal = array_ops.reshape(
                    array_ops.transpose(
                        array_ops.reshape(
                            cos_list_normal,
                            [-1, 2 * normal_size // (2**size)])),
                    [-1])
                sin_list_normal = array_ops.reshape(
                    array_ops.transpose(
                        array_ops.reshape(
                            sin_list_normal,
                            [-1, 2 * normal_size // (2**size)])),
                    [-1])

            cos_list = array_ops.concat([cos_list_normal, cos_list_extra], 0)
            sin_list = array_ops.concat([sin_list_normal, sin_list_extra], 0)
            diag_list_0.append(cos_list)
            off_list_0.append(sin_list)

        diag_vec = array_ops.stack(diag_list_0, 0)
        off_vec = array_ops.stack(off_list_0, 0)

    else:
        # Two kinds of layers alternate: kind "a" uses hidden_size // 2
        # angles, kind "b" uses (hidden_size - 1) // 2 angles and gets one
        # extra leading column.
        capacity_b = capacity // 2
        capacity_a = capacity - capacity_b

        hidden_size_a = hidden_size // 2
        hidden_size_b = (hidden_size - 1) // 2

        params_theta_0 = vs.get_variable(
            "theta_0", [capacity_a, hidden_size_a],
            initializer=theta_phi_initializer)
        cos_theta_0 = array_ops.reshape(
            math_ops.cos(params_theta_0), [capacity_a, -1, 1])
        sin_theta_0 = array_ops.reshape(
            math_ops.sin(params_theta_0), [capacity_a, -1, 1])

        params_theta_1 = vs.get_variable(
            "theta_1", [capacity_b, hidden_size_b],
            initializer=theta_phi_initializer)
        cos_theta_1 = array_ops.reshape(
            math_ops.cos(params_theta_1), [capacity_b, -1, 1])
        sin_theta_1 = array_ops.reshape(
            math_ops.sin(params_theta_1), [capacity_b, -1, 1])

        if comp:
            params_phi_0 = vs.get_variable(
                "phi_0", [capacity_a, hidden_size_a],
                initializer=theta_phi_initializer)
            cos_phi_0 = array_ops.reshape(
                math_ops.cos(params_phi_0), [capacity_a, -1, 1])
            sin_phi_0 = array_ops.reshape(
                math_ops.sin(params_phi_0), [capacity_a, -1, 1])

            cos_list_0_re = array_ops.reshape(
                array_ops.concat(
                    [cos_theta_0, math_ops.multiply(cos_theta_0, cos_phi_0)],
                    2),
                [capacity_a, -1])
            cos_list_0_im = array_ops.reshape(
                array_ops.concat([
                    array_ops.zeros_like(cos_theta_0),
                    math_ops.multiply(cos_theta_0, sin_phi_0)
                ], 2),
                [capacity_a, -1])
            if hidden_size_a * 2 != hidden_size:
                cos_list_0_re = array_ops.concat(
                    [cos_list_0_re, tf.ones([capacity_a, 1])], 1)
                cos_list_0_im = array_ops.concat(
                    [cos_list_0_im, tf.zeros([capacity_a, 1])], 1)
            cos_list_0 = math_ops.complex(cos_list_0_re, cos_list_0_im)

            sin_list_0_re = array_ops.reshape(
                array_ops.concat(
                    [sin_theta_0, -math_ops.multiply(sin_theta_0, cos_phi_0)],
                    2),
                [capacity_a, -1])
            sin_list_0_im = array_ops.reshape(
                array_ops.concat([
                    array_ops.zeros_like(sin_theta_0),
                    -math_ops.multiply(sin_theta_0, sin_phi_0)
                ], 2),
                [capacity_a, -1])
            if hidden_size_a * 2 != hidden_size:
                sin_list_0_re = array_ops.concat(
                    [sin_list_0_re, tf.zeros([capacity_a, 1])], 1)
                sin_list_0_im = array_ops.concat(
                    [sin_list_0_im, tf.zeros([capacity_a, 1])], 1)
            sin_list_0 = math_ops.complex(sin_list_0_re, sin_list_0_im)

            params_phi_1 = vs.get_variable(
                "phi_1", [capacity_b, hidden_size_b],
                initializer=theta_phi_initializer)
            cos_phi_1 = array_ops.reshape(
                math_ops.cos(params_phi_1), [capacity_b, -1, 1])
            sin_phi_1 = array_ops.reshape(
                math_ops.sin(params_phi_1), [capacity_b, -1, 1])

            cos_list_1_re = array_ops.reshape(
                array_ops.concat(
                    [cos_theta_1, math_ops.multiply(cos_theta_1, cos_phi_1)],
                    2),
                [capacity_b, -1])
            cos_list_1_re = array_ops.concat(
                [tf.ones((capacity_b, 1)), cos_list_1_re], 1)
            cos_list_1_im = array_ops.reshape(
                array_ops.concat([
                    array_ops.zeros_like(cos_theta_1),
                    math_ops.multiply(cos_theta_1, sin_phi_1)
                ], 2),
                [capacity_b, -1])
            cos_list_1_im = array_ops.concat(
                [tf.zeros((capacity_b, 1)), cos_list_1_im], 1)
            if hidden_size_b * 2 != hidden_size - 1:
                cos_list_1_re = array_ops.concat(
                    [cos_list_1_re, tf.ones([capacity_b, 1])], 1)
                cos_list_1_im = array_ops.concat(
                    [cos_list_1_im, tf.zeros([capacity_b, 1])], 1)
            cos_list_1 = math_ops.complex(cos_list_1_re, cos_list_1_im)

            sin_list_1_re = array_ops.reshape(
                array_ops.concat(
                    [sin_theta_1, -math_ops.multiply(sin_theta_1, cos_phi_1)],
                    2),
                [capacity_b, -1])
            sin_list_1_re = array_ops.concat(
                [tf.zeros((capacity_b, 1)), sin_list_1_re], 1)
            sin_list_1_im = array_ops.reshape(
                array_ops.concat([
                    array_ops.zeros_like(sin_theta_1),
                    -math_ops.multiply(sin_theta_1, sin_phi_1)
                ], 2),
                [capacity_b, -1])
            sin_list_1_im = array_ops.concat(
                [tf.zeros((capacity_b, 1)), sin_list_1_im], 1)
            if hidden_size_b * 2 != hidden_size - 1:
                sin_list_1_re = array_ops.concat(
                    [sin_list_1_re, tf.zeros([capacity_b, 1])], 1)
                sin_list_1_im = array_ops.concat(
                    [sin_list_1_im, tf.zeros([capacity_b, 1])], 1)
            sin_list_1 = math_ops.complex(sin_list_1_re, sin_list_1_im)
        else:
            cos_list_0 = array_ops.reshape(
                array_ops.concat([cos_theta_0, cos_theta_0], 2),
                [capacity_a, -1])
            sin_list_0 = array_ops.reshape(
                array_ops.concat([sin_theta_0, -sin_theta_0], 2),
                [capacity_a, -1])
            if hidden_size_a * 2 != hidden_size:
                cos_list_0 = array_ops.concat(
                    [cos_list_0, tf.ones([capacity_a, 1])], 1)
                sin_list_0 = array_ops.concat(
                    [sin_list_0, tf.zeros([capacity_a, 1])], 1)

            cos_list_1 = array_ops.reshape(
                array_ops.concat([cos_theta_1, cos_theta_1], 2),
                [capacity_b, -1])
            cos_list_1 = array_ops.concat(
                [tf.ones((capacity_b, 1)), cos_list_1], 1)
            sin_list_1 = array_ops.reshape(
                array_ops.concat([sin_theta_1, -sin_theta_1], 2),
                [capacity_b, -1])
            sin_list_1 = array_ops.concat(
                [tf.zeros((capacity_b, 1)), sin_list_1], 1)
            if hidden_size_b * 2 != hidden_size - 1:
                # The trailing unpaired unit gets cos = 1 and sin = 0, the
                # same padding the complex branch and every other unpaired
                # slot use; a zero cosine here would zero that unit.
                cos_list_1 = array_ops.concat(
                    [cos_list_1, tf.ones([capacity_b, 1])], 1)
                sin_list_1 = array_ops.concat(
                    [sin_list_1, tf.zeros([capacity_b, 1])], 1)

        if capacity_b != capacity_a:
            # Odd capacity: pad the "b" rows with a dummy all-zero row so the
            # two kinds can be interleaved; the dummy row is sliced off below.
            if comp:
                cos_list_1 = array_ops.concat([
                    cos_list_1,
                    math_ops.complex(
                        tf.zeros([1, hidden_size]),
                        tf.zeros([1, hidden_size]))
                ], 0)
                sin_list_1 = array_ops.concat([
                    sin_list_1,
                    math_ops.complex(
                        tf.zeros([1, hidden_size]),
                        tf.zeros([1, hidden_size]))
                ], 0)
            else:
                cos_list_1 = array_ops.concat(
                    [cos_list_1, tf.zeros([1, hidden_size])], 0)
                sin_list_1 = array_ops.concat(
                    [sin_list_1, tf.zeros([1, hidden_size])], 0)

        # Interleave the rows as a0, b0, a1, b1, ...
        diag_vec = tf.reshape(
            tf.concat([cos_list_0, cos_list_1], 1),
            [capacity_a * 2, hidden_size])
        off_vec = tf.reshape(
            tf.concat([sin_list_0, sin_list_1], 1),
            [capacity_a * 2, hidden_size])

        if capacity_b != capacity_a:
            diag_vec = tf.slice(diag_vec, [0, 0], [capacity, hidden_size])
            off_vec = tf.slice(off_vec, [0, 0], [capacity, hidden_size])

    def _toTensorArray(elems):
        # Unstack `elems` along axis 0 into a fixed-size, re-readable
        # TensorArray.
        elems = ops.convert_to_tensor(elems)
        n = array_ops.shape(elems)[0]
        elems_ta = tensor_array_ops.TensorArray(
            dtype=elems.dtype,
            size=n,
            dynamic_size=False,
            infer_shape=True,
            clear_after_read=False)
        elems_ta = elems_ta.unstack(elems)
        return elems_ta

    diag_vec = _toTensorArray(diag_vec)
    off_vec = _toTensorArray(off_vec)
    if comp:
        omega = vs.get_variable(
            "omega", [hidden_size], initializer=theta_phi_initializer)
        diag = math_ops.complex(math_ops.cos(omega), math_ops.sin(omega))
    else:
        diag = None

    return diag_vec, off_vec, diag, capacity
def testLSTMFusedSequenceLengths(self):
    """Verify proper support for sequence lengths in LSTMBlockFusedCell.

    A `BasicLSTMCell` run through `rnn.static_rnn` with `sequence_length` is
    the reference. It is compared against `LSTMBlockFusedCell` over the whole
    sequence (scope "fused") and against the same fused cell applied one time
    step at a time with the state carried over (scope "unfused").
    """
    batch_size = 3
    input_size = 4
    cell_size = 5
    max_sequence_length = 6

    with self.test_session(use_gpu=True) as sess:

        def evaluate_grads(ys, xs):
            # Build d(ys)/d(xs) and evaluate it immediately.
            return sess.run(gradients_impl.gradients(ys, xs))

        def scoped_trainables(prefix):
            # Trainable variables whose name starts with the scope prefix.
            return [
                var for var in variables.trainable_variables()
                if var.name.startswith(prefix)
            ]

        inputs = [
            ops.convert_to_tensor(
                np.random.randn(batch_size, input_size), dtype=dtypes.float32)
            for _ in range(max_sequence_length)
        ]
        seq_lengths = constant_op.constant([3, 4, 5])
        initializer = init_ops.random_uniform_initializer(
            -0.01, 0.01, seed=19890213)

        with variable_scope.variable_scope("basic", initializer=initializer):
            cell = rnn_cell.BasicLSTMCell(cell_size, state_is_tuple=True)
            outputs, state = rnn.static_rnn(
                cell, inputs, dtype=dtypes.float32,
                sequence_length=seq_lengths)
            sess.run([variables.global_variables_initializer()])
            basic_outputs, basic_state = sess.run([outputs, state[0]])
            basic_grads = evaluate_grads(outputs, inputs)
            basic_wgrads = evaluate_grads(
                outputs, variables.trainable_variables())

        def assert_matches_basic(got_outputs, got_state, got_grads,
                                 got_wgrads):
            # Exact comparison, except for a looser one on weight gradients.
            self.assertAllClose(basic_outputs, got_outputs)
            self.assertAllClose(basic_state, got_state)
            self.assertAllClose(basic_grads, got_grads)
            for expected, actual in zip(basic_wgrads, got_wgrads):
                self.assertAllClose(expected, actual, rtol=1e-2, atol=1e-2)

        with variable_scope.variable_scope("fused", initializer=initializer):
            cell = lstm_ops.LSTMBlockFusedCell(
                cell_size, cell_clip=0, use_peephole=False)
            outputs, state = cell(
                inputs, dtype=dtypes.float32, sequence_length=seq_lengths)
            sess.run([variables.global_variables_initializer()])
            fused_outputs, fused_state = sess.run([outputs, state[0]])
            fused_grads = evaluate_grads(outputs, inputs)
            fused_vars = scoped_trainables("fused/")
            fused_wgrads = evaluate_grads(outputs, fused_vars)

        assert_matches_basic(
            fused_outputs, fused_state, fused_grads, fused_wgrads)

        # Verify that state propagation works if we turn our sequence into
        # tiny (single-time) subsequences, i.e. unfuse the cell
        with variable_scope.variable_scope(
                "unfused", initializer=initializer) as unfused_scope:
            cell = lstm_ops.LSTMBlockFusedCell(
                cell_size, cell_clip=0, use_peephole=False)
            step_outputs = []
            state = None
            for step, step_input in enumerate(inputs):
                # Per-example length is 1 while still inside the sequence and
                # 0 afterwards.
                lengths = [
                    int(step < full_length)
                    for full_length in seq_lengths.eval()
                ]
                output, state = cell(
                    [step_input], initial_state=state, dtype=dtypes.float32,
                    sequence_length=lengths)
                unfused_scope.reuse_variables()
                step_outputs.append(output[0])
            outputs = array_ops.stack(step_outputs)
            sess.run([variables.global_variables_initializer()])
            unfused_outputs, unfused_state = sess.run([outputs, state[0]])
            unfused_grads = evaluate_grads(outputs, inputs)
            unfused_vars = scoped_trainables("unfused/")
            unfused_wgrads = evaluate_grads(outputs, unfused_vars)

        assert_matches_basic(
            unfused_outputs, unfused_state, unfused_grads, unfused_wgrads)
def RunLSTM(sess,
            num_units,
            input_size,
            batch_size,
            time,
            num_layers=1,
            variable_seq_lengths=False,
            time_major=True,
            dynamic_shape_input=False,
            is_training=True,
            dropout=0.,
            num_dirs=True,
            dtype=dtypes.float32):
    """Run a canonical TF LSTM and a cudnn LSTM on identical data.

    Builds one `rnn_cell_impl.LSTMCell` network and one
    `cudnn_rnn_ops._cudnn_rnn` LSTM that share the same weights, inputs and
    initial states, runs both in `sess`, and returns their results side by
    side so that the caller can compare them.

    Args:
      sess: Session used to initialize variables and evaluate the ops.
      num_units: Number of LSTM units.
      input_size: Size of the input feature dimension.
      batch_size: Batch size.
      time: Number of time steps.
      num_layers: Must be 1 (multi-layer is not supported yet).
      variable_seq_lengths: If true, draw random per-example lengths in
        `[1, time]`, with the first example forced to `time`.
      time_major: If true, inputs are `[time, batch, input]`; otherwise
        `[batch, time, input]`.
      dynamic_shape_input: If true, the cudnn op is fed through a placeholder
        of unknown shape instead of the static input variable.
      is_training: If true, gradients are built and returned as well.
      dropout: Dropout rate passed to cudnn; must be 0 when `is_training`.
      num_dirs: Must compare equal to 1 (the default `True` does).
      dtype: Data type of inputs, states and weights.

    Returns:
      When `is_training`: `(outputs, cu_outputs, state_tuple, cu_state_tuple,
      inp_grad, cu_inp_grad, state_grad, cu_state_grad, wgrad, bgrad,
      cu_wgrad, cu_bgrad)`. Otherwise: `(outputs, cu_outputs, state_tuple,
      cu_state_tuple)`. Names prefixed with `cu_` come from the cudnn op.

    Raises:
      ValueError: If `is_training` is true and `dropout` is not close to 0.
    """
    # TODO(jamesqin): add multi-layer tests.
    # TODO(jamesqin): add multi-dir tests
    assert num_layers == 1
    assert num_dirs == 1
    if is_training and not np.isclose(dropout, 0):
        # The check rejects non-zero dropout, so the message says so.
        raise ValueError("dropout must be 0. when testing training.")

    # set graph level random seed and numpy random seed.
    random_seed.set_random_seed(0)
    np.random.seed(0)

    shape = ([time, batch_size, input_size]
             if time_major else [batch_size, time, input_size])
    inputs_np = np.random.rand(*shape).astype(dtype.as_numpy_dtype)
    inputs_static = variable_scope.get_variable(
        "inputs", initializer=inputs_np, dtype=dtype)
    inputs_dynamic = array_ops.placeholder(
        dtype, shape=[None, None, None], name="inputs")
    # Only the cudnn op sees the placeholder; the TF LSTM below always reads
    # the static variable.
    inputs = inputs_dynamic if dynamic_shape_input else inputs_static
    initial_h_op = variable_scope.get_variable(
        "initial_h_op",
        initializer=np.random.rand(batch_size,
                                   num_units).astype(dtype.as_numpy_dtype),
        dtype=dtype)
    initial_c_op = variable_scope.get_variable(
        "initial_c_op",
        initializer=np.random.rand(batch_size,
                                   num_units).astype(dtype.as_numpy_dtype),
        dtype=dtype)

    if variable_seq_lengths:
        lengths_v = np.random.randint(low=1, high=time + 1, size=batch_size)
        lengths_v[0] = time  # make sure the max sequence has 'time' elems
        lengths = ops.convert_to_tensor(lengths_v.astype(np.int32))
    else:
        lengths = None

    initializer = init_ops.random_uniform_initializer(
        -0.01, 0.01, dtype=dtype, seed=19980904)

    with variable_scope.variable_scope("test", initializer=initializer):
        # Created under the names the LSTMCell (reuse=True) looks up, so the
        # cell and the cudnn op share these weights.
        w = variable_scope.get_variable(
            "rnn/lstm_cell/kernel",
            shape=[input_size + num_units, num_units * 4],
            dtype=dtype)
        b = variable_scope.get_variable(
            "rnn/lstm_cell/bias", shape=[num_units * 4], dtype=dtype)

        # canonical lstm. must set forget_bias to 0. to align with cudnn lstm.
        cell = rnn_cell_impl.LSTMCell(num_units, forget_bias=0., reuse=True)
        outputs_op, state_tuple_op = rnn.dynamic_rnn(
            cell,
            inputs_static,
            sequence_length=lengths,
            initial_state=rnn_cell_impl.LSTMStateTuple(
                h=initial_h_op, c=initial_c_op),
            dtype=dtype,
            time_major=time_major,
            scope=None)

    # Convert to cudnn opaque param.
    format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterLSTM(
        num_layers, num_units, input_size)
    opaque_params = format_converter.tf_canonical_to_opaque([w, b])

    cu_initial_h_op = array_ops.expand_dims(
        initial_h_op, axis=(0 if time_major else 1))
    cu_initial_c_op = array_ops.expand_dims(
        initial_c_op, axis=(0 if time_major else 1))
    cu_outputs_op, cu_h_op, cu_c_op = cudnn_rnn_ops._cudnn_rnn(
        inputs,
        cu_initial_h_op,
        cu_initial_c_op,
        opaque_params,
        sequence_lengths=lengths,
        time_major=time_major,
        dropout=dropout,
        is_training=is_training,
        rnn_mode=cudnn_rnn_ops.CUDNN_LSTM)
    # Remove the trivial 1st dimension.
    cu_state_tuple_op = rnn_cell_impl.LSTMStateTuple(
        c=array_ops.squeeze(cu_c_op, axis=0 if time_major else 1),
        h=array_ops.squeeze(cu_h_op, axis=0 if time_major else 1))

    if is_training:
        (inp_grad_op, hgrad_op, cgrad_op,
         wgrad_op, bgrad_op) = gradients_impl.gradients(
             outputs_op, [inputs_static, initial_h_op, initial_c_op, w, b])

        (cu_inp_grad_op, cu_hgrad_op, cu_cgrad_op,
         opaque_grad_op) = gradients_impl.gradients(
             cu_outputs_op,
             [inputs, cu_initial_h_op, cu_initial_c_op, opaque_params])
        # Remove the trivial 1st dimension
        cu_hgrad_op = array_ops.squeeze(
            cu_hgrad_op, axis=0 if time_major else 1)
        # Remove the trivial 1st dimension
        cu_cgrad_op = array_ops.squeeze(
            cu_cgrad_op, axis=0 if time_major else 1)

        cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical(
            opaque_grad_op)
        cu_wgrad_op = cu_wgrad_op[0]
        cu_bgrad_op = cu_bgrad_op[0]
        # cudnn lstm has 2 biases each gate. When converting to tf canonical
        # format, the two biases are summed into one. Thus here bias gradient
        # should be halved when comparing with tf lstm.
        cu_bgrad_op *= 0.5

    init_op = variables.global_variables_initializer()
    sess.run(init_op)

    if is_training:
        outputs, state_tuple, inp_grad, state_grad, wgrad, bgrad = sess.run([
            outputs_op, state_tuple_op, inp_grad_op,
            (hgrad_op, cgrad_op), wgrad_op, bgrad_op
        ])
        (cu_outputs, cu_state_tuple, cu_inp_grad, cu_state_grad, cu_wgrad,
         cu_bgrad) = sess.run(
             [
                 cu_outputs_op, cu_state_tuple_op, cu_inp_grad_op,
                 (cu_hgrad_op, cu_cgrad_op), cu_wgrad_op, cu_bgrad_op
             ],
             feed_dict={inputs: inputs_np} if dynamic_shape_input else None)

        logging.vlog(1, "outputs: %s" % outputs)
        logging.vlog(1, "cu_outputs: %s" % cu_outputs)
        logging.vlog(1, "state_tuple: %s" % str(state_tuple))
        logging.vlog(1, "cu_state_tuple: %s" % str(cu_state_tuple))
        logging.vlog(1, "inp_grad: %s" % inp_grad)
        logging.vlog(1, "cu_inp_grad: %s" % cu_inp_grad)
        logging.vlog(1, "state_grad: %s" % str(state_grad))
        logging.vlog(1, "cu_state_grad: %s" % str(cu_state_grad))
        logging.vlog(1, "wgrad: %s" % str(wgrad))
        logging.vlog(1, "bgrad: %s" % str(bgrad))
        logging.vlog(1, "cu_wgrad: %s" % str(cu_wgrad))
        logging.vlog(1, "cu_bgrad: %s" % str(cu_bgrad))
        return (outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad,
                cu_inp_grad, state_grad, cu_state_grad, wgrad, bgrad,
                cu_wgrad, cu_bgrad)
    else:
        outputs, state_tuple = sess.run([outputs_op, state_tuple_op])
        cu_outputs, cu_state_tuple = sess.run(
            [cu_outputs_op, cu_state_tuple_op],
            feed_dict=({
                inputs: inputs_np
            } if dynamic_shape_input else None))

        logging.vlog(1, "outputs: %s" % outputs)
        logging.vlog(1, "cu_outputs: %s" % cu_outputs)
        logging.vlog(1, "state_tuple: %s" % str(state_tuple))
        logging.vlog(1, "cu_state_tuple: %s" % str(cu_state_tuple))
        return outputs, cu_outputs, state_tuple, cu_state_tuple
def blocks_match(sess, use_peephole):
    """Run three LSTM implementations on shared weights and collect results.

    Builds `rnn_cell.LSTMCell` (via `rnn.static_rnn`), the `block_lstm` op and
    `lstm_ops.LSTMBlockFusedCell` over the same random inputs and the same
    variables, then evaluates outputs, input gradients and weight gradients
    of each in `sess`.

    Args:
      sess: Session used to initialize variables and evaluate the ops.
      use_peephole: Whether all three implementations use peephole weights.

    Returns:
      `(basic_state, fused_state, basic_outputs, block_outputs,
      fused_outputs, basic_grads, block_grads, fused_grads, basic_wgrads,
      block_wgrads, fused_wgrads)`.
    """
    batch_size = 2
    input_size = 3
    cell_size = 4
    sequence_length = 4

    inputs = [
        ops.convert_to_tensor(
            np.random.randn(batch_size, input_size), dtype=dtypes.float32)
        for _ in range(sequence_length)
    ]
    stacked_inputs = array_ops.stack(inputs)

    def evaluate_grads(ys, xs):
        # Build d(ys)/d(xs) and evaluate it immediately.
        return sess.run(gradients_impl.gradients(ys, xs))

    initializer = init_ops.random_uniform_initializer(
        -0.01, 0.01, seed=19890212)

    with variable_scope.variable_scope("test", initializer=initializer):
        # magic naming so that the cells pick up these variables and reuse
        # them
        peephole_weights = []
        if use_peephole:
            for diag_name in ("rnn/lstm_cell/w_i_diag",
                              "rnn/lstm_cell/w_f_diag",
                              "rnn/lstm_cell/w_o_diag"):
                peephole_weights.append(
                    variable_scope.get_variable(
                        diag_name, shape=[cell_size], dtype=dtypes.float32))

        w = variable_scope.get_variable(
            "rnn/lstm_cell/kernel",
            shape=[input_size + cell_size, cell_size * 4],
            dtype=dtypes.float32)
        b = variable_scope.get_variable(
            "rnn/lstm_cell/bias",
            shape=[cell_size * 4],
            dtype=dtypes.float32,
            initializer=init_ops.zeros_initializer())

        basic_cell = rnn_cell.LSTMCell(
            cell_size,
            use_peepholes=use_peephole,
            state_is_tuple=True,
            reuse=True)
        basic_outputs_op, basic_state_op = rnn.static_rnn(
            basic_cell, inputs, dtype=dtypes.float32)

        # The peephole weights and flag are only passed when peepholes are on.
        block_kwargs = {"cell_clip": 0}
        if use_peephole:
            wci, wcf, wco = peephole_weights
            block_kwargs.update(wci=wci, wcf=wcf, wco=wco, use_peephole=True)
        seq_len_max = ops.convert_to_tensor(
            sequence_length, dtype=dtypes.int64)
        # Only the last of the seven results of block_lstm is used.
        _, _, _, _, _, _, block_outputs_op = block_lstm(
            seq_len_max, inputs, w, b, **block_kwargs)

        fused_cell = lstm_ops.LSTMBlockFusedCell(
            cell_size,
            cell_clip=0,
            use_peephole=use_peephole,
            reuse=True,
            name="rnn/lstm_cell")
        fused_outputs_op, fused_state_op = fused_cell(
            stacked_inputs, dtype=dtypes.float32)

        sess.run([variables.global_variables_initializer()])

        # Weights differentiated against: kernel, bias, then the peepholes.
        weight_vars = [w, b] + peephole_weights

        basic_outputs, basic_state = sess.run(
            [basic_outputs_op, basic_state_op[0]])
        basic_grads = evaluate_grads(basic_outputs_op, inputs)
        basic_wgrads = evaluate_grads(basic_outputs_op, weight_vars)

        block_outputs = sess.run(block_outputs_op)
        block_grads = evaluate_grads(block_outputs_op, inputs)
        block_wgrads = evaluate_grads(block_outputs_op, weight_vars)

        fused_outputs, fused_state = sess.run(
            [fused_outputs_op, fused_state_op[0]])
        fused_grads = evaluate_grads(fused_outputs_op, inputs)
        fused_wgrads = evaluate_grads(fused_outputs_op, weight_vars)

        return (basic_state, fused_state, basic_outputs, block_outputs,
                fused_outputs, basic_grads, block_grads, fused_grads,
                basic_wgrads, block_wgrads, fused_wgrads)
def RunGRU(sess,
           num_units,
           input_size,
           batch_size,
           time,
           num_layers=1,
           is_training=True,
           variable_seq_lengths=False,
           time_major=True,
           dynamic_shape_input=False,
           dropout=0.,
           num_dirs=True,
           dtype=dtypes.float32):
    """Run a cudnn-compatible TF GRU and a cudnn GRU on identical data.

    Builds one `cudnn_rnn_ops.CudnnCompatibleGRUCell` network and one
    `cudnn_rnn_ops._cudnn_rnn` GRU that share the same weights, inputs and
    initial state, runs both in `sess`, and returns their results side by
    side so that the caller can compare them.

    Args:
      sess: Session used to initialize variables and evaluate the ops.
      num_units: Number of GRU units.
      input_size: Size of the input feature dimension.
      batch_size: Batch size.
      time: Number of time steps.
      num_layers: Must be 1 (multi-layer is not supported yet).
      is_training: If true, gradients are built and returned as well.
      variable_seq_lengths: If true, draw random per-example lengths in
        `[1, time]`, with the first example forced to `time`.
      time_major: If true, inputs are `[time, batch, input]`; otherwise
        `[batch, time, input]`.
      dynamic_shape_input: If true, the cudnn op is fed through a placeholder
        of unknown shape instead of the static input variable.
      dropout: Dropout rate passed to cudnn; must be 0 when `is_training`.
      num_dirs: Must compare equal to 1 (the default `True` does).
      dtype: Data type of inputs, state and weights.

    Returns:
      When `is_training`: `(outputs, cu_outputs, h, cu_h, inp_grad,
      cu_inp_grad, hgrad, cu_hgrad, wgrad, bgrad, cu_wgrad, cu_bgrad)`.
      Otherwise: `(outputs, cu_outputs, h, cu_h)`. Names prefixed with `cu_`
      come from the cudnn op.

    Raises:
      ValueError: If `is_training` is true and `dropout` is not close to 0.
    """
    # TODO(jamesqin): add multi-layer tests.
    # TODO(jamesqin): add multi-dir tests
    assert num_layers == 1
    assert num_dirs == 1
    if is_training and not np.isclose(dropout, 0):
        # The check rejects non-zero dropout, so the message says so.
        raise ValueError("dropout must be 0. when testing training.")

    # set graph level random seed and numpy random seed.
    random_seed.set_random_seed(0)
    np.random.seed(0)

    shape = ([time, batch_size, input_size]
             if time_major else [batch_size, time, input_size])
    inputs_np = np.random.rand(*shape).astype(dtype.as_numpy_dtype)
    inputs_static = variable_scope.get_variable(
        "inputs", initializer=inputs_np, dtype=dtype)
    inputs_dynamic = array_ops.placeholder(
        dtype, shape=[None, None, None], name="inputs")
    # Only the cudnn op sees the placeholder; the TF GRU below always reads
    # the static variable.
    inputs = inputs_dynamic if dynamic_shape_input else inputs_static
    initial_h_op = variable_scope.get_variable(
        "initial_h_op",
        initializer=np.random.rand(batch_size,
                                   num_units).astype(dtype.as_numpy_dtype),
        dtype=dtype)

    if variable_seq_lengths:
        lengths_v = np.random.randint(low=1, high=time + 1, size=batch_size)
        lengths_v[0] = time  # make sure the max sequence has 'time' elems
        lengths = ops.convert_to_tensor(lengths_v.astype(np.int32))
    else:
        lengths = None

    initializer = init_ops.random_uniform_initializer(
        -0.01, 0.01, dtype=dtype, seed=19980904)
    with variable_scope.variable_scope("test", initializer=initializer):
        # Created under the names the GRU cell (reuse=True) looks up, so the
        # cell and the cudnn op share these weights.
        gate_kernel = variable_scope.get_variable(
            "rnn/cudnn_compatible_gru_cell/gates/kernel",
            shape=[input_size + num_units, num_units * 2],
            dtype=dtype)
        gate_bias = variable_scope.get_variable(
            "rnn/cudnn_compatible_gru_cell/gates/bias",
            shape=[num_units * 2],
            dtype=dtype)
        candidate_inp_kernel = variable_scope.get_variable(
            "rnn/cudnn_compatible_gru_cell/candidate/input_projection/kernel",
            shape=[input_size, num_units],
            dtype=dtype)
        candidate_inp_bias = variable_scope.get_variable(
            "rnn/cudnn_compatible_gru_cell/candidate/input_projection/bias",
            shape=[num_units],
            dtype=dtype)
        candidate_hid_kernel = variable_scope.get_variable(
            "rnn/cudnn_compatible_gru_cell/candidate/hidden_projection/kernel",
            shape=[num_units, num_units],
            dtype=dtype)
        candidate_hid_bias = variable_scope.get_variable(
            "rnn/cudnn_compatible_gru_cell/candidate/hidden_projection/bias",
            shape=[num_units],
            dtype=dtype)

        cell = cudnn_rnn_ops.CudnnCompatibleGRUCell(num_units, reuse=True)
        outputs_op, h_op = rnn.dynamic_rnn(
            cell,
            inputs_static,
            sequence_length=lengths,
            initial_state=initial_h_op,
            dtype=dtype,
            time_major=time_major,
            scope=None)

    ws = [gate_kernel, candidate_inp_kernel, candidate_hid_kernel]
    bs = [gate_bias, candidate_inp_bias, candidate_hid_bias]
    # Convert to cudnn opaque param.
    format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterGRU(
        num_layers, num_units, input_size)
    opaque_params = format_converter.tf_canonical_to_opaque(ws + bs)

    cu_initial_h_op = array_ops.expand_dims(
        initial_h_op, axis=(0 if time_major else 1))
    cu_outputs_op, cu_h_op, _ = cudnn_rnn_ops._cudnn_rnn(
        inputs,
        cu_initial_h_op,
        array_ops.zeros_like(cu_initial_h_op),  # not used
        opaque_params,
        sequence_lengths=lengths,
        time_major=time_major,
        dropout=dropout,
        is_training=is_training,
        rnn_mode=cudnn_rnn_ops.CUDNN_GRU)

    if is_training:
        (inp_grad_op, hgrad_op, gk_grad_op, cik_grad_op, chk_grad_op,
         gb_grad_op, cib_grad_op, chb_grad_op) = gradients_impl.gradients(
             outputs_op, [inputs_static, initial_h_op] + ws + bs)

        (cu_inp_grad_op, cu_hgrad_op,
         opaque_grad_op) = gradients_impl.gradients(
             cu_outputs_op, [inputs, cu_initial_h_op, opaque_params])
        # Remove the trivial 1st dimension
        cu_hgrad_op = array_ops.squeeze(
            cu_hgrad_op, axis=0 if time_major else 1)

        cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical(
            opaque_grad_op)
        (cu_gk_grad_op, cu_cik_grad_op, cu_chk_grad_op) = cu_wgrad_op
        (cu_gb_grad_op, cu_cib_grad_op, cu_chb_grad_op) = cu_bgrad_op
        # cudnn gru has 2 biases for reset and update gates. When converting
        # to tf canonical format, the two biases are summed into one. Thus
        # here relevant bias gradient should be halved before comparing with
        # tf gru.
        cu_gb_grad_op *= 0.5

    init_op = variables.global_variables_initializer()
    sess.run(init_op)

    if is_training:
        outputs, h, inp_grad, hgrad, wgrad, bgrad = sess.run([
            outputs_op, h_op, inp_grad_op, hgrad_op,
            (gk_grad_op, cik_grad_op, chk_grad_op),
            (gb_grad_op, cib_grad_op, chb_grad_op)
        ])
        (cu_outputs, cu_h, cu_inp_grad, cu_hgrad, cu_wgrad,
         cu_bgrad) = sess.run(
             [
                 cu_outputs_op, cu_h_op, cu_inp_grad_op, cu_hgrad_op,
                 (cu_gk_grad_op, cu_cik_grad_op, cu_chk_grad_op),
                 (cu_gb_grad_op, cu_cib_grad_op, cu_chb_grad_op)
             ],
             feed_dict={inputs: inputs_np} if dynamic_shape_input else None)
        # Remove the trivial 1st dimension
        cu_h = np.squeeze(cu_h, axis=0 if time_major else 1)

        logging.vlog(1, "outputs: %s" % outputs)
        logging.vlog(1, "cu_outputs: %s" % cu_outputs)
        logging.vlog(1, "h: %s" % h)
        # Log the cudnn state itself, not the TF state a second time.
        logging.vlog(1, "cu_h: %s" % cu_h)
        logging.vlog(1, "inp_grad: %s" % inp_grad)
        logging.vlog(1, "cu_inp_grad: %s" % cu_inp_grad)
        logging.vlog(1, "hgrad: %s" % hgrad)
        logging.vlog(1, "cu_hgrad: %s" % cu_hgrad)
        logging.vlog(1, "wgrad: %s" % str(wgrad))
        logging.vlog(1, "bgrad: %s" % str(bgrad))
        logging.vlog(1, "cu_wgrad: %s" % str(cu_wgrad))
        logging.vlog(1, "cu_bgrad: %s" % str(cu_bgrad))
        return (outputs, cu_outputs, h, cu_h, inp_grad, cu_inp_grad, hgrad,
                cu_hgrad, wgrad, bgrad, cu_wgrad, cu_bgrad)
    else:
        outputs, h = sess.run([outputs_op, h_op])
        cu_outputs, cu_h = sess.run(
            [cu_outputs_op, cu_h_op],
            feed_dict=({
                inputs: inputs_np
            } if dynamic_shape_input else None))
        # Remove the trivial 1st dimension.
        cu_h = np.squeeze(cu_h, axis=0 if time_major else 1)

        logging.vlog(1, "outputs: %s" % outputs)
        logging.vlog(1, "cu_outputs: %s" % cu_outputs)
        logging.vlog(1, "h: %s" % h)
        # Log the cudnn state itself, not the TF state a second time.
        logging.vlog(1, "cu_h: %s" % cu_h)
        return outputs, cu_outputs, h, cu_h
def __init__(self,
             vocab_size,
             embedding_size,
             batch_size,
             learning_rate,
             learning_rate_decay_op,
             memory_hops,
             dropout_rate,
             q_depth,
             a_depth,
             episodic_m_depth,
             ep_depth,
             attention_ff_l1_size,
             max_gradient_norm,
             maximum_story_length=5,
             maximum_question_length=20,
             use_lstm=False,
             forward_only=False):
    """Build the TensorFlow graph of the Dynamic Memory Network.

    The graph is assembled in four stages: a GRU encodes the question and
    every story sentence, a bidirectional GRU fuses the sentence encodings
    into facts, an attention-gated modified GRU (``MGRU``) is run
    ``memory_hops`` times to update the episodic memory, and the last memory
    state is projected onto the vocabulary to produce the answer.

    NOTE(review): the call signatures used here (``tf.concat(axis, values)``,
    ``tf.mul``, ``tf.sub``, ``tf.unpack``, ``tf.nn.bidirectional_rnn``,
    ``tf.all_variables``) look like the pre-1.0 TensorFlow API -- confirm the
    targeted TensorFlow version before touching them.

    Args:
        vocab_size: number of rows of the embedding matrix and width of the
            answer projection / one-hot target.
        embedding_size: width of the word embeddings; also the size of every
            GRU cell and of the memory state (``self.m_size``).
        batch_size: stored on the instance only; the reshapes below fix the
            leading dimension to 1.
        learning_rate: initial value of the non-trainable
            ``self.learning_rate`` variable fed to the optimizer.
        learning_rate_decay_op: despite its name, converted with ``float()``
            and stored in a non-trainable variable.
        memory_hops: number of episodic-memory passes that are unrolled.
        dropout_rate: stored on the instance; the only other references are
            in commented-out ``DropoutWrapper`` lines.
        q_depth, a_depth, episodic_m_depth, ep_depth: stored on the instance
            and not used further in this method.
        attention_ff_l1_size: hidden width of the attention feed-forward net.
        max_gradient_norm: global-norm clipping threshold for the gradients.
        maximum_story_length: number of story-sentence placeholders created.
        maximum_question_length: not used in this method.
        use_lstm: not used in this method.
        forward_only: when true, the gradient / update ops are not built.
    """
    # initialization
    self.vocab_size = vocab_size
    self.embedding_size = embedding_size
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = tf.Variable(
        float(learning_rate_decay_op), trainable=False)
    self.dropout_rate = dropout_rate
    self.global_step = tf.Variable(0, trainable=False, name='global_step')
    self.q_depth = q_depth  # question RNN depth
    self.a_depth = a_depth  # answer RNN depth
    self.m_depth = episodic_m_depth  # memory cell depth
    self.ep_depth = ep_depth  # episodic depth
    self.max_gradient_norm = max_gradient_norm
    self.memory_hops = memory_hops  # number of episodic memory pass
    self.m_input_size = embedding_size * 3
    self.m_size = embedding_size  # memory cell size
    self.attention_ff_l1_size = attention_ff_l1_size
    self.maximum_story_length = maximum_story_length
    print("[*] Creating Dynamic Memory Network ...")

    # Initializing word2vec
    # The embedding matrix is drawn uniformly from [-sqrt(3), sqrt(3)].
    sqrt3 = math.sqrt(3)
    initializer = init_ops.random_uniform_initializer(-sqrt3, sqrt3)
    W = tf.get_variable("embedding", [self.vocab_size, self.embedding_size],
                        initializer=initializer)
    # W = tf.Variable(tf.constant(0.0, shape=[vocab_size, embedding_size]),
    #                 trainable=False, name="W")
    # self.embedding_placeholder = tf.placeholder(tf.float32, [vocab_size, embedding_size])
    # self.embedding_init = W.assign(self.embedding_placeholder)

    # Sentence token placeholder
    # One int32 placeholder per story sentence; each embedded sentence has its
    # first two axes swapped because the RNNs below run with time_major=True.
    self.story = []
    story_embedded = []
    for i in range(maximum_story_length):
        self.story.append(
            tf.placeholder(tf.int32, shape=[None, None], name="Story"))
        story_embedded.append(tf.nn.embedding_lookup(W, self.story[i]))
        story_embedded[i] = tf.transpose(story_embedded[i], [1, 0, 2])
    self.story_len = tf.placeholder(tf.int32, shape=[1], name="Story_length")
    self.question = tf.placeholder(tf.int32, shape=[None, None],
                                   name="Question")
    question_embedded = tf.transpose(
        tf.nn.embedding_lookup(W, self.question), [1, 0, 2])
    self.answer = tf.placeholder(tf.int64, name="answer")

    # configuration of attention gate
    # NOTE(review): the positional arguments after the shape are
    # (mean, stddev), so this draws with mean -0.1 and stddev 0.1 -- confirm
    # that a [-0.1, 0.1] range was not what was intended.
    answer_weights = tf.Variable(tf.truncated_normal(
        [self.m_size, self.vocab_size], -0.1, 0.1), name="answer_weights")
    # NOTE(review): answer_biases is created but never added to the logits
    # computed below -- confirm whether that is deliberate.
    answer_biases = tf.Variable(tf.zeros([self.vocab_size]),
                                name="answer_biases")

    #------------ question module ------------
    # The final GRU state over the question tokens is the question encoding.
    with tf.variable_scope("embedding_rnn"):
        embedding_cell = tf.nn.rnn_cell.GRUCell(self.embedding_size)
        #embedding_cell = tf.nn.rnn_cell.DropoutWrapper(
        #    embedding_cell, output_keep_prob=dropout_rate)
        _, self.question_state = tf.nn.dynamic_rnn(embedding_cell,
                                                   question_embedded,
                                                   dtype=tf.float32,
                                                   time_major=True)

    #------------ Input module ------------
    # Story_embedding_cell = tf.nn.rnn_cell.GRUCell(self.embedding_size)
    # Story_embedding_cell = tf.nn.rnn_cell.DropoutWrapper(
    #     Story_embedding_cell, output_keep_prob=dropout_rate)
    self.story_state_array = []
    # with tf.name_scope("story_embedding_rnn"):
    # Every sentence is encoded with the same cell inside the same variable
    # scope (reuse=True), i.e. with the weights used for the question.
    for i in range(maximum_story_length):
        with tf.variable_scope("embedding_rnn", reuse=True):
            _, story_states = tf.nn.dynamic_rnn(embedding_cell,
                                                story_embedded[i],
                                                dtype=tf.float32,
                                                time_major=True)
        self.story_state_array.append(story_states)

    # Input fusion: a bidirectional GRU over the list of sentence encodings.
    fusion_fw_cell = tf.nn.rnn_cell.GRUCell(self.embedding_size)
    #fusion_fw_cell = tf.nn.rnn_cell.DropoutWrapper(
    #    fusion_fw_cell, output_keep_prob=dropout_rate)
    fusion_bw_cell = tf.nn.rnn_cell.GRUCell(self.embedding_size)
    #fusion_bw_cell = tf.nn.rnn_cell.DropoutWrapper(
    #    fusion_bw_cell, output_keep_prob=dropout_rate)
    (self.facts_, _, _) = tf.nn.bidirectional_rnn(fusion_fw_cell,
                                                  fusion_bw_cell,
                                                  self.story_state_array,
                                                  dtype=tf.float32)
    # (self.facts_, _) = tf.nn.rnn(fusion_fw_cell, self.story_state_array, sequence_length=self.story_len, dtype=tf.float32, scope='story_rnn')

    #------------ episodic memory module ------------
    # z (built in the hop loop) concatenates four terms; its width is declared
    # here as embedding_size * 8. z_dim is assigned but not read afterwards.
    attention_ff_size = z_dim = self.embedding_size * 8
    attention_ff_l2_size = 1
    # The question state is duplicated along axis 1 before being combined
    # element-wise with the facts.
    # presumably each fact is 2 * embedding_size wide (forward + backward
    # outputs) -- TODO confirm against the bidirectional_rnn in use.
    self.question_state_double = tf.concat(
        1, [self.question_state, self.question_state])

    # -------- multi-layer feedforward for multi-hop propagation -----------
    self.facts = tf.concat(0, self.facts_)
    # ep_cell = cell.MGRUCell(self.ep_size)
    # mem_cell = cell.MemCell(self.m_size)
    # Memory update weights: the input is the concatenation
    # [mem_state, context, question_state] built in the hop loop.
    mem_weights = tf.get_variable("mem_weights",
                                  [embedding_size * 4, self.m_size],
                                  initializer=tf.random_normal_initializer(
                                      0.0, 0.5))
    mem_biases = tf.get_variable(
        "mem_biases", [self.m_size],
        initializer=tf.random_normal_initializer())
    # Two-layer feed-forward net that scores each fact (output width 1).
    l1_weights = tf.get_variable(
        "l1_weights", [attention_ff_size, attention_ff_l1_size],
        initializer=tf.random_normal_initializer())
    l1_biases = tf.get_variable("l1_biases", [attention_ff_l1_size],
                                initializer=tf.random_normal_initializer())
    l2_weights = tf.get_variable(
        "l2_weights", [attention_ff_l1_size, attention_ff_l2_size],
        initializer=tf.random_normal_initializer())
    l2_biases = tf.get_variable("l2_biases", [attention_ff_l2_size],
                                initializer=tf.random_normal_initializer())

    # Parameters of the modified GRU, shared by every hop through closure.
    mgru_weights = {}
    embedding_size_double = embedding_size * 2
    mgru_weights['ur_weights'] = tf.get_variable(
        'ur_weights', [embedding_size_double, embedding_size_double],
        initializer=tf.random_normal_initializer())
    mgru_weights['wr_weights'] = tf.get_variable(
        'wr_weights', [embedding_size_double, embedding_size_double],
        initializer=tf.random_normal_initializer())
    mgru_weights['wr_bias'] = tf.get_variable(
        'wr_bias', [embedding_size_double],
        initializer=tf.random_normal_initializer())
    mgru_weights['uh_weights'] = tf.get_variable(
        'uh_weights', [embedding_size_double, embedding_size_double],
        initializer=tf.random_normal_initializer())
    mgru_weights['wh_weights'] = tf.get_variable(
        'wh_weights', [embedding_size_double, embedding_size_double],
        initializer=tf.random_normal_initializer())
    mgru_weights['wh_bias'] = tf.get_variable(
        'wh_bias', [embedding_size_double],
        initializer=tf.random_normal_initializer())

    def MGRU(inputs, episodic_gates):
        """Run the attention-gated GRU over the facts and return its state.

        Args:
            inputs: sequence of fact tensors; each one is reshaped to
                ``[1, embedding_size_double]``.
            episodic_gates: sequence of attention gates, one per fact, that
                replaces the usual GRU update gate.

        Returns:
            The ``[1, embedding_size_double]`` state after the last fact.
        """
        # NOTE(review): batch_size is computed but never used below.
        batch_size = array_ops.shape(inputs[0])[0]
        state = tf.zeros([1, embedding_size_double], tf.float32)
        for time, (input_, episodic_gate_) in enumerate(
                zip(inputs, episodic_gates)):
            input_ = tf.reshape(input_, [1, embedding_size_double])
            # Reset gate.
            r = tf.sigmoid(
                tf.matmul(input_, mgru_weights['ur_weights']) +
                tf.matmul(state, mgru_weights['wr_weights']) +
                mgru_weights['wr_bias'])
            # Candidate state.
            c = tf.tanh(
                tf.matmul(input_, mgru_weights['uh_weights']) +
                tf.mul(r, tf.matmul(state, mgru_weights['wh_weights'])) +
                mgru_weights['wh_bias'])
            # The attention gate interpolates between candidate and old state.
            state = tf.mul(episodic_gate_, c) + tf.mul(
                (1 - episodic_gate_), state)
        return state

    # Disabled tf.while_loop formulation of the hop loop built further down:
    # episodic_gate_unpacked = []
    # def condition(mem_state_previous, hops):
    #     mem_state_previous = tf.concat(1, [mem_state_previous, mem_state_previous])
    #     z = tf.concat(1, [tf.mul(self.facts, self.question_state_double), tf.mul(self.facts, mem_state_previous),
    #                       tf.abs(tf.sub(self.facts, self.question_state_double)), tf.abs(tf.sub(self.facts, mem_state_previous))], name="z")
    #     episodic_array_reshaped = tf.reshape(tf.matmul(tf.tanh(tf.matmul(z , l1_weights) + l1_biases) , l2_weights)
    #                                          + l2_biases, [1,-1], name="episodic_array_reshaped")
    #     episodic_gate = tf.nn.softmax(episodic_array_reshaped)
    #     episodic_gate_unpacked = tf.unpack( tf.reshape(episodic_gate, [maximum_story_length,1]))
    #     argmax_ep_gate = tf.to_int32(tf.argmax(episodic_gate, 1)) #should be 1
    #     # return tf.cond(tf.equal(hops,0),lambda: tf.constant(True),
    #     #     lambda: tf.logical_and(tf.less(argmax_ep_gate,self.story_len)[0],tf.less(hops,tf.constant(self.memory_hops))))
    #     # return tf.logical_and(tf.less(argmax_ep_gate,self.story_len)[0],tf.less(hops,tf.constant(self.memory_hops)))
    #     return tf.less(hops,tf.constant(self.memory_hops))
    # def body(mem_state_previous, hops):
    #     # attention GRU
    #     # outputs, context = cell.rnn_ep(ep_cell, self.facts_, episodic_gate_unpacked, dtype=tf.float32)
    #     # outputs, context = ep_cell(ep_cell, self.facts_, episodic_gate_unpacked)
    #     context = MGRU(self.facts_, episodic_gate_unpacked)
    #     # memory updates
    #     # mem_state_current = mem_cell(mem_state_previous, self.question_state, mem_state_previous, mem_weights, mem_biases, hops)
    #     #question_state_next = question_state_prev
    #     #print (self.question_state, mem_state_previous)
    #     mem_state_current = tf.nn.relu(tf.matmul(tf.concat(1, [mem_state_previous, context, self.question_state]), mem_weights) + mem_biases)
    #     hops = tf.add(hops,1)
    #     return mem_state_current, hops

    # Unrolled episodic memory passes; the memory starts as the question state.
    mem_state_array = []
    gate_array = []
    mem_state = self.question_state
    for i in range(self.memory_hops):
        mem_state_double = tf.concat(1, [mem_state, mem_state])
        # Interaction features between each fact, the question and the memory.
        z = tf.concat(1, [
            tf.mul(self.facts, self.question_state_double),
            tf.mul(self.facts, mem_state_double),
            tf.abs(tf.sub(self.facts, self.question_state_double)),
            tf.abs(tf.sub(self.facts, mem_state_double))
        ], name="z")
        # One score per fact, laid out as a single row so that the softmax
        # normalises across facts.
        episodic_array_reshaped = tf.reshape(
            tf.matmul(tf.tanh(tf.matmul(z, l1_weights) + l1_biases),
                      l2_weights) + l2_biases, [1, -1],
            name="episodic_array_reshaped")
        episodic_gate = tf.nn.softmax(episodic_array_reshaped)
        gate_array.append(episodic_gate)
        # The reshape to [maximum_story_length, 1] requires exactly one score
        # per story sentence.
        episodic_gate_unpacked = tf.unpack(
            tf.reshape(episodic_gate, [maximum_story_length, 1]))
        # argmax_ep_gate = tf.to_int32(tf.argmax(episodic_gate, 1)) #should be 1
        context = MGRU(self.facts_, episodic_gate_unpacked)
        # Memory update: ReLU of a linear map of [memory, context, question].
        mem_state = tf.nn.relu(
            tf.matmul(
                tf.concat(1, [mem_state, context, self.question_state]),
                mem_weights) + mem_biases)
        mem_state_array.append(mem_state)
    # initial_argmax_ep_gate = tf.constant(0)
    # initial_hops = tf.constant(0)
    # # initial_context = tf.constant([[0.5 for _ in range(50)]])
    # mem_state, self.hops = tf.while_loop(condition,body,[self.question_state, initial_hops], back_prop=True)

    # Answer module: linear projection of the last memory state.
    self.gate_array = tf.concat(0, gate_array)
    self.a_state = mem_state_array[-1]
    self.predicted_answer = tf.matmul(self.a_state, answer_weights)
    self.softmax_answer = tf.nn.softmax(self.predicted_answer)
    self.argmax_answer = tf.argmax(self.softmax_answer, 1)
    # The target is reshaped to a single one-hot row, i.e. one answer per run.
    answer = tf.reshape(tf.one_hot(self.answer, self.vocab_size, 1.0, 0.0),
                        [1, self.vocab_size])
    self.loss = tf.nn.softmax_cross_entropy_with_logits(
        self.predicted_answer, answer)
    # self.loss = tf.nn.softmax_cross_entropy_with_logits(self.predicted_answer, answer)

    params = tf.trainable_variables()
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
        # optimizer = tf.train.AdamOptimizer(self.learning_rate)
        gradients = tf.gradients(self.loss, params)
        clipped_gradients, norm = tf.clip_by_global_norm(
            gradients, self.max_gradient_norm)
        # The empty lists assigned above are overwritten with the global norm
        # tensor and the single apply-gradients op.
        self.gradient_norms = norm
        self.updates = optimizer.apply_gradients(
            zip(clipped_gradients, params), global_step=self.global_step)
    self.saver = tf.train.Saver(tf.all_variables())
def __call__(self, x, state, scope=None):
    """Run one step of the phased convolutional LSTM cell.

    Args:
        x: input for the current step; passed through ``self.shape_in``
            before the convolution.
        state: 3-tuple ``(c, h, step)`` holding the cell state, the hidden
            state and a step counter whose first element is used as the
            current time.
        scope: optional variable scope name; defaults to the class name.

    Returns:
        A pair ``(phased_new_h, (phased_new_c, phased_new_h, step + 1))``:
        the gated hidden state and the next state tuple.
    """
    with tf.variable_scope(scope or type(self).__name__):
        c, h, step = state
        # One copy of the current time per output channel, so that every
        # channel can be compared against its own period and shift.
        timestamps = tf.tile(step[0], [self.num_out_ch])

        c = self.shape_in(c)
        h = self.shape_in(h)
        x = self.shape_in(x, True)

        # Leak applied while the time gate is closed; zero outside training.
        if self.is_training:
            alpha = tf.constant(self.alpha, dtype=tf.float32)
        else:
            alpha = tf.constant(0, dtype=tf.float32)

        bias = tf.get_variable('bias', [4 * self.num_out_ch])
        # Oscillation period of each channel.
        tau = tf.get_variable("tau", [self.num_out_ch],
                              initializer=random_exp_initializer(
                                  0, self.tau_init),
                              dtype=tf.float32)
        # Phase shift, drawn uniformly between 0 and the initial period.
        s = tf.get_variable(
            "s", [self.num_out_ch],
            initializer=init_ops.random_uniform_initializer(
                0., tau.initialized_value()),
            dtype=tf.float32)
        # Fraction of the period during which the gate is open (not trained).
        r_on = tf.get_variable("r_on", [self.num_out_ch],
                               initializer=init_ops.constant_initializer(
                                   self.r_on_init),
                               dtype=tf.float32,
                               trainable=False)

        # Phase inside the period, as a fraction of tau.
        phi = dk_mod(dk_mod((timestamps - s), tau) + tau, tau) / tau
        is_up = tf.less(phi, (r_on * 0.5))
        is_down = tf.logical_and(tf.less(phi, r_on), tf.logical_not(is_up))
        # Piecewise gate: rising in the first half of the open phase, falling
        # in the second half, and alpha * phi otherwise.
        k = tf.where(
            is_up, 2. * (phi / r_on),
            tf.where(is_down, 2. - 2. * (phi / r_on), alpha * phi))
        # Reshaped so that it applies per channel on the last axis.
        k = tf.reshape(k, [1, 1, 1, self.num_out_ch])

        # Input-to-hidden and hidden-to-hidden convolutions for all four gates.
        xh = conv_linear([x], self.filter_size, self.num_out_ch * 4, False,
                         scope='xh',
                         initializer=conv_orthogonal_initializer,
                         init_param=None)
        hh = conv_linear([h], self.filter_size, self.num_out_ch * 4, False,
                         scope='hh',
                         initializer=conv_identity_initializer,
                         init_param=0.95)
        hidden = xh + hh + bias
        i, j, f, o = tf.split(hidden, 4, axis=3)

        # Regular LSTM update, then blended with the previous state by k.
        new_c = c * tf.nn.sigmoid(f) + tf.nn.sigmoid(i) * self.activation(j)
        phased_new_c = k * new_c + (1 - k) * c
        new_h = tf.nn.tanh(new_c) * tf.nn.sigmoid(o)
        phased_new_h = k * new_h + (1 - k) * h

        phased_new_c = self.shape_out(phased_new_c)
        phased_new_h = self.shape_out(phased_new_h)
        return phased_new_h, (phased_new_c, phased_new_h, step + 1)
def testDuplicatedInitializer(self):
    """A uniform [0, 1) initializer must not be reported as duplicated."""
    uniform_init = init_ops.random_uniform_initializer(0.0, 1.0)
    is_duplicated = duplicated_initializer(self, uniform_init, 1)
    self.assertFalse(is_duplicated)
def EUNN_param(hidden_size, capacity=2, FFT=False, comp=False):
    """Create the rotation-angle variables of an EUNN layer and their cos/sin
    coefficient tensors.

    Args:
        hidden_size: width of the hidden state the layer acts on.
        capacity: number of rotation layers. Ignored when ``FFT`` is true,
            where it is replaced by ``int(log2(hidden_size))``.
        FFT: selects the FFT-style arrangement (``permute_FFT``) instead of
            the tunable one (``permute_tunable``).
        comp: when true, additional phase variables are created and the
            coefficients are complex.

    Returns:
        A tuple ``(v1, v2, ind, diag, capacity)``: the cos-based and sin-based
        coefficients and the permutation indices, each passed through
        ``toTensorArray``; ``diag`` is ``complex(cos(omega), sin(omega))``
        when ``comp`` is true and ``None`` otherwise; ``capacity`` is the
        value actually used.
    """
    theta_phi_initializer = init_ops.random_uniform_initializer(-np.pi, np.pi)
    # Variable shapes need integer dimensions. Under Python 3 a bare
    # `hidden_size / 2` is a float, so the conversion is done once here and
    # shared by both branches.
    half_hidden = int(hidden_size / 2)

    if FFT:
        capacity = int(np.log2(hidden_size))

        params_theta_0 = vs.get_variable("theta_0", [capacity, half_hidden],
                                         initializer=theta_phi_initializer)
        cos_theta_0 = math_ops.cos(params_theta_0)
        sin_theta_0 = math_ops.sin(params_theta_0)

        if comp:
            params_phi_0 = vs.get_variable("phi_0", [capacity, half_hidden],
                                           initializer=theta_phi_initializer)
            cos_phi_0 = math_ops.cos(params_phi_0)
            sin_phi_0 = math_ops.sin(params_phi_0)

            # Real and imaginary parts are assembled separately and then
            # combined into complex coefficients.
            cos_list_0_re = array_ops.concat(
                [cos_theta_0, math_ops.multiply(cos_theta_0, cos_phi_0)], 1)
            cos_list_0_im = array_ops.concat([
                array_ops.zeros_like(cos_theta_0),
                math_ops.multiply(cos_theta_0, sin_phi_0)
            ], 1)
            sin_list_0_re = array_ops.concat(
                [sin_theta_0, -math_ops.multiply(sin_theta_0, cos_phi_0)], 1)
            sin_list_0_im = array_ops.concat([
                array_ops.zeros_like(sin_theta_0),
                -math_ops.multiply(sin_theta_0, sin_phi_0)
            ], 1)
            cos_list_0 = array_ops.unstack(
                math_ops.complex(cos_list_0_re, cos_list_0_im))
            sin_list_0 = array_ops.unstack(
                math_ops.complex(sin_list_0_re, sin_list_0_im))
        else:
            cos_list_0 = array_ops.unstack(
                array_ops.concat([cos_theta_0, cos_theta_0], 1))
            sin_list_0 = array_ops.unstack(
                array_ops.concat([sin_theta_0, -sin_theta_0], 1))

        ind, ind1 = permute_FFT(hidden_size)
        ind1_list = array_ops.unstack(ind1)

        # Each layer's coefficients are permuted with that layer's indices.
        diag_list_0 = []
        off_list_0 = []
        for i in range(capacity):
            diag_list_0.append(permute(cos_list_0[i], ind1_list[i]))
            off_list_0.append(permute(sin_list_0[i], ind1_list[i]))

        v1 = array_ops.stack(diag_list_0, 0)
        v2 = array_ops.stack(off_list_0, 0)
    else:
        half_capacity = int(capacity / 2)

        # First set of rotations.
        params_theta_0 = vs.get_variable("theta_0",
                                         [half_capacity, half_hidden],
                                         initializer=theta_phi_initializer)
        cos_theta_0 = math_ops.cos(params_theta_0)
        sin_theta_0 = math_ops.sin(params_theta_0)

        if comp:
            params_phi_0 = vs.get_variable("phi_0",
                                           [half_capacity, half_hidden],
                                           initializer=theta_phi_initializer)
            cos_phi_0 = math_ops.cos(params_phi_0)
            sin_phi_0 = math_ops.sin(params_phi_0)

            cos_list_0_re = array_ops.concat(
                [cos_theta_0, math_ops.multiply(cos_theta_0, cos_phi_0)], 1)
            cos_list_0_im = array_ops.concat([
                array_ops.zeros_like(cos_theta_0),
                math_ops.multiply(cos_theta_0, sin_phi_0)
            ], 1)
            sin_list_0_re = array_ops.concat(
                [sin_theta_0, -math_ops.multiply(sin_theta_0, cos_phi_0)], 1)
            sin_list_0_im = array_ops.concat([
                array_ops.zeros_like(sin_theta_0),
                -math_ops.multiply(sin_theta_0, sin_phi_0)
            ], 1)
            cos_list_0 = array_ops.unstack(
                math_ops.complex(cos_list_0_re, cos_list_0_im))
            sin_list_0 = array_ops.unstack(
                math_ops.complex(sin_list_0_re, sin_list_0_im))
        else:
            cos_list_0 = array_ops.concat([cos_theta_0, cos_theta_0], 1)
            sin_list_0 = array_ops.concat([sin_theta_0, -sin_theta_0], 1)

        # Second set of rotations: one fewer angle per row, with constant
        # columns (ones for cos, zeros for sin) added at both ends.
        params_theta_1 = vs.get_variable("theta_1",
                                         [half_capacity, half_hidden - 1],
                                         initializer=theta_phi_initializer)
        cos_theta_1 = math_ops.cos(params_theta_1)
        sin_theta_1 = math_ops.sin(params_theta_1)

        if comp:
            params_phi_1 = vs.get_variable("phi_1",
                                           [half_capacity, half_hidden - 1],
                                           initializer=theta_phi_initializer)
            cos_phi_1 = math_ops.cos(params_phi_1)
            sin_phi_1 = math_ops.sin(params_phi_1)

            cos_list_1_re = array_ops.concat([
                np.ones((half_capacity, 1)), cos_theta_1,
                math_ops.multiply(cos_theta_1, cos_phi_1),
                np.ones((half_capacity, 1))
            ], 1)
            cos_list_1_im = array_ops.concat([
                np.zeros((half_capacity, 1)),
                array_ops.zeros_like(cos_theta_1),
                math_ops.multiply(cos_theta_1, sin_phi_1),
                np.zeros((half_capacity, 1))
            ], 1)
            sin_list_1_re = array_ops.concat([
                np.zeros((half_capacity, 1)), sin_theta_1,
                -math_ops.multiply(sin_theta_1, cos_phi_1),
                np.zeros((half_capacity, 1))
            ], 1)
            sin_list_1_im = array_ops.concat([
                np.zeros((half_capacity, 1)),
                array_ops.zeros_like(sin_theta_1),
                -math_ops.multiply(sin_theta_1, sin_phi_1),
                np.zeros((half_capacity, 1))
            ], 1)
            cos_list_1 = array_ops.unstack(
                math_ops.complex(cos_list_1_re, cos_list_1_im))
            sin_list_1 = array_ops.unstack(
                math_ops.complex(sin_list_1_re, sin_list_1_im))
        else:
            cos_list_1 = array_ops.concat([
                np.ones((half_capacity, 1)), cos_theta_1, cos_theta_1,
                np.ones((half_capacity, 1))
            ], 1)
            sin_list_1 = array_ops.concat([
                np.zeros((half_capacity, 1)), sin_theta_1, -sin_theta_1,
                np.zeros((half_capacity, 1))
            ], 1)

        ind, ind3, ind4 = permute_tunable(hidden_size, capacity)

        diag_list_0 = permute(cos_list_0, ind3)
        off_list_0 = permute(sin_list_0, ind3)
        diag_list_1 = permute(cos_list_1, ind4)
        off_list_1 = permute(sin_list_1, ind4)

        # Join the two sets side by side, then reshape to one row per layer.
        v1 = tf.reshape(tf.concat([diag_list_0, diag_list_1], 1),
                        [capacity, hidden_size])
        v2 = tf.reshape(tf.concat([off_list_0, off_list_1], 1),
                        [capacity, hidden_size])

    if comp:
        # Diagonal phase factor complex(cos(omega), sin(omega)).
        omega = vs.get_variable("omega", [hidden_size],
                                initializer=theta_phi_initializer)
        D = math_ops.complex(math_ops.cos(omega), math_ops.sin(omega))
    else:
        D = None

    v1 = toTensorArray(v1)
    v2 = toTensorArray(v2)
    ind = toTensorArray(ind)
    diag = D

    return v1, v2, ind, diag, capacity
def __call__(self, inputs, state, scope=None):
    """Run one step of the phased LSTM (P-LSTM) cell.

    ``inputs`` carries the feature in column 0 and the timestamp in column 1;
    ``state`` is the ``(c_prev, h_prev)`` pair. Returns ``(new_h, (new_c,
    new_h))`` where both parts of the state have been blended with their
    previous values by the time gate ``kappa``.
    """
    with vs.variable_scope(scope or type(self).__name__):
        c_prev, h_prev = state

        # The time is passed explicitly alongside the feature.
        # (batch_size, seq_len, 2)
        x = tf.reshape(inputs[:, 0], (-1, 1))
        # Only one timestamp is accepted for now: the one of the last row.
        # How to handle a per-row time within a batch is still to be defined.
        t = inputs[:, 1][-1]

        # All four gate pre-activations come out of a single linear map:
        # i = input_gate, j = new_input, f = forget_gate, o = output_gate.
        gate_preacts = _linear([x, h_prev], 4 * self._num_units, True)
        i, j, f, o = array_ops.split(
            value=gate_preacts, num_or_size_splits=4, axis=1)

        dtype = inputs.dtype
        unit_shape = [self._num_units]

        # Time-gate parameters: period, open ratio and phase shift.
        tau = vs.get_variable(
            'tau',
            shape=unit_shape,
            initializer=random_exp_initializer(0, self.tau_init),
            dtype=dtype)
        r_on = vs.get_variable(
            'r_on',
            shape=unit_shape,
            initializer=init_ops.constant_initializer(self.r_on_init),
            dtype=dtype)
        s = vs.get_variable(
            's',
            shape=unit_shape,
            initializer=init_ops.random_uniform_initializer(
                0., tau.initialized_value()),
            dtype=dtype)

        times = tf.tile(tf.reshape(t, [-1, 1]), [1, self._num_units])
        phase = phi(times, s, tau)
        kappa = time_gate_fast(phase, r_on, self._leak_rate,
                               self._training_phase)

        w_o_peephole = None
        if self._use_peepholes:
            w_i_peephole, w_f_peephole, w_o_peephole = [
                vs.get_variable(name, shape=unit_shape, dtype=dtype)
                for name in ('W_I_peephole', 'W_F_peephole', 'W_O_peephole')
            ]
            f = f + w_f_peephole * c_prev
            i = i + w_i_peephole * c_prev

        kept = sigmoid(f) * c_prev
        written = sigmoid(i) * self._activation(j)
        new_c_tilde = kept + written

        if self._use_peepholes:
            o = o + w_o_peephole * new_c_tilde

        new_h_tilde = sigmoid(o) * self._activation(new_c_tilde)

        # Note from the paper's author (Danny), answering Philippe: Equation 4
        # should reference c_tilde and not c; the paper and Figure 1 are to be
        # updated accordingly. The intuition is that the gates should be blind
        # to the effect of the khronos gate: input, forget and output gate
        # should all operate as if the cell were a normal LSTM cell, while the
        # khronos gate allows it to either operate or not operate (and then
        # linearly interpolates between these two states). If the output gate
        # were influenced by the khronos gate (peepholes referencing c instead
        # of c_tilde), the PLSTM would no longer be a gated LSTM cell but would
        # depend on the time gate's actual operation. It likely would not
        # influence much, but it should be updated in the paper.

        def _khronos_blend(candidate, previous):
            # Apply Khronos gate
            return kappa * candidate + (1 - kappa) * previous

        new_h = _khronos_blend(new_h_tilde, h_prev)
        new_c = _khronos_blend(new_c_tilde, c_prev)
        return new_h, (new_c, new_h)