def value_function(self):
    """Build the value-function output for the current model instance.

    A forward pass must already have populated ``self.cur_instance``.

    When ``model_config["vf_share_layers"]`` is set, a single linear unit
    is stacked on the current instance's ``last_layer``. Otherwise a
    separate ``legacy_model_cls`` instance with one output is built from
    the same input dict, with ``free_log_std`` and ``use_lstm`` forced off.

    Returns:
        The value output reshaped to 1-D (``[-1]``).
    """
    assert self.cur_instance, "must call forward first"

    with self._branch_variable_scope("value_function"):
        if self.model_config["vf_share_layers"]:
            # Shared feature layer: only a linear value head is added.
            vf_head = linear(self.cur_instance.last_layer, 1,
                             "value_function", normc_initializer(1.0))
            return tf.reshape(vf_head, [-1])

        # Separate value network: built without RNN state.
        vf_config = self.model_config.copy()
        vf_config["free_log_std"] = False
        if vf_config["use_lstm"]:
            vf_config["use_lstm"] = False
            logger.warning(
                "It is not recommended to use a LSTM model with "
                "vf_share_layers=False (consider setting it to True). "
                "If you want to not share layers, you can implement "
                "a custom LSTM model that overrides the "
                "value_function() method.")

        vf_model = self.legacy_model_cls(
            self.cur_instance.input_dict,
            self.obs_space,
            self.action_space,
            1,
            vf_config,
            state_in=None,
            seq_lens=None)
        return tf.reshape(vf_model.outputs, [-1])
def _build_layers_v2(self, input_dict, num_outputs, options):
    """Build an LSTM layer on top of the observation features.

    Args:
        input_dict: Dict of input tensors. Reads ``"obs"`` and, when
            ``options["lstm_use_prev_action_reward"]`` is truthy, also
            ``"prev_actions"`` and ``"prev_rewards"``.
        num_outputs: Number of units of the final linear "action" layer.
        options: Model options; reads ``"lstm_cell_size"`` and
            ``"lstm_use_prev_action_reward"``.

    Returns:
        Tuple ``(logits, last_layer)`` where ``last_layer`` is the LSTM
        output reshaped to ``[-1, cell_size]`` and ``logits`` is the linear
        projection of it to ``num_outputs`` units.

    Side effects:
        Sets ``self.state_init``, ``self.state_out`` and, if it was falsy,
        ``self.state_in`` (to freshly created placeholders).
    """
    # Deprecated: all Models should use the ModelV2 API from here on.
    # NOTE: `error=False` is passed, so this is presumably a warning only.
    deprecation_warning("Model->LSTM", "RecurrentNetwork", error=False)

    cell_size = options.get("lstm_cell_size")

    if options.get("lstm_use_prev_action_reward"):
        # Flatten all non-batch action dims into one feature axis.
        # `np.prod` replaces the `np.product` alias, which was removed in
        # NumPy 2.0.
        action_dim = int(
            np.prod(input_dict["prev_actions"].get_shape().as_list()[1:]))
        features = tf.concat(
            [
                input_dict["obs"],
                tf.reshape(
                    tf.cast(input_dict["prev_actions"], tf.float32),
                    [-1, action_dim]),
                tf.reshape(input_dict["prev_rewards"], [-1, 1]),
            ],
            axis=1)
    else:
        features = input_dict["obs"]

    last_layer = add_time_dimension(features, self.seq_lens)

    # Setup the LSTM cell
    lstm = tf1.nn.rnn_cell.LSTMCell(cell_size, state_is_tuple=True)
    self.state_init = [
        np.zeros(lstm.state_size.c, np.float32),
        np.zeros(lstm.state_size.h, np.float32),
    ]

    # Setup LSTM inputs: reuse provided state tensors, else create
    # placeholders and expose them via `self.state_in`.
    if self.state_in:
        c_in, h_in = self.state_in
    else:
        c_in = tf1.placeholder(
            tf.float32, [None, lstm.state_size.c], name="c")
        h_in = tf1.placeholder(
            tf.float32, [None, lstm.state_size.h], name="h")
        self.state_in = [c_in, h_in]

    # Setup LSTM outputs
    state_in = tf1.nn.rnn_cell.LSTMStateTuple(c_in, h_in)
    lstm_out, lstm_state = tf1.nn.dynamic_rnn(
        lstm,
        last_layer,
        initial_state=state_in,
        sequence_length=self.seq_lens,
        time_major=False,
        dtype=tf.float32)
    self.state_out = list(lstm_state)

    # Compute outputs: drop the time axis again before the linear layer.
    last_layer = tf.reshape(lstm_out, [-1, cell_size])
    logits = linear(last_layer, num_outputs, "action",
                    normc_initializer(0.01))
    return logits, last_layer
def value_function(self):
    """Builds the value function output.

    Subclasses may override this to customize how the value function is
    computed (e.g., to avoid sharing hidden layers with the policy).

    Returns:
        Tensor of size [BATCH_SIZE] for the value function.
    """
    # Single linear unit named "value" on top of the last hidden layer.
    value_head = linear(self.last_layer, 1, "value", normc_initializer(1.0))
    # Flatten to one dimension.
    return tf.reshape(value_head, [-1])
def value_function(self):
    """Build the value-function output for the current model instance.

    A forward pass must already have set ``self.cur_instance``.

    When ``model_config["vf_share_layers"]`` is set, a single linear unit
    is stacked on the current instance's ``last_layer``. Otherwise a
    separate ``legacy_model_cls`` instance with one output is built from
    the same input dict, with ``free_log_std`` and ``use_lstm`` forced off.

    Returns:
        The value output reshaped to 1-D (``[-1]``).
    """
    assert self.cur_instance is not None, "must call forward first"

    # The second scope is only created after the first has been entered,
    # exactly as with two nested `with` statements.
    with tf1.variable_scope(self.variable_scope), \
            tf1.variable_scope("value_function", reuse=tf1.AUTO_REUSE):
        if self.model_config["vf_share_layers"]:
            # Shared feature layer: only a linear value head is added.
            vf_head = linear(self.cur_instance.last_layer, 1,
                             "value_function", normc_initializer(1.0))
            return tf.reshape(vf_head, [-1])

        # Separate value network: built without RNN state.
        vf_config = self.model_config.copy()
        vf_config["free_log_std"] = False
        vf_obs_space = self.obs_space

        if vf_config["use_lstm"]:
            vf_config["use_lstm"] = False
            logger.warning(
                "It is not recommended to use an LSTM model "
                "with the `vf_share_layers=False` option. "
                "If you want to use separate policy- and vf-"
                "networks with LSTMs, you can implement a custom "
                "LSTM model that overrides the value_function() "
                "method. "
                "NOTE: Your policy- and vf-NNs will use the same "
                "shared LSTM!")
            # Work on a shallow copy without `original_space` so that no
            # preprocessing is triggered (the vf-NN input is already the
            # vectorized LSTM output).
            vf_obs_space = copy.copy(self.obs_space)
            if hasattr(vf_obs_space, "original_space"):
                del vf_obs_space.original_space

        vf_model = self.legacy_model_cls(
            self.cur_instance.input_dict,
            vf_obs_space,
            self.action_space,
            1,
            vf_config,
            state_in=None,
            seq_lens=None)
        return tf.reshape(vf_model.outputs, [-1])
def _build_layers_v2(self, input_dict, num_outputs, options):
    """Build a small LSTM model that records its training-time inputs.

    A ``tf.py_func`` hook is attached as a control dependency of the output
    layers. Each time it runs with more than one sequence, it pickles the
    sequences, input/output states and sequence lengths and stores them
    under the key ``rnn_spy_in_<capture_index>`` in Ray's internal KV store.

    Args:
        input_dict: Dict of input tensors; only ``"obs"`` is read.
        num_outputs: Number of units of the final linear "action" layer.
        options: Unused here.

    Returns:
        Tuple ``(logits, last_layer)``.
    """
    # The class object is no longer re-created on deserialization, so the
    # class-level capture counter has to be reset by hand on each build.
    RNNSpyModel.capture_index = 0

    def spy(seqs, states_in, states_out, lens):
        # Don't capture inference inputs.
        if len(seqs) == 1:
            return 0
        # TF runs this function in an isolated context, so the captured
        # data is handed back to the test suite through the internal KV
        # store.
        key = "rnn_spy_in_{}".format(RNNSpyModel.capture_index)
        payload = pickle.dumps({
            "sequences": seqs,
            "state_in": states_in,
            "state_out": states_out,
            "seq_lens": lens
        })
        ray.experimental.internal_kv._internal_kv_put(
            key, payload, overwrite=True)
        RNNSpyModel.capture_index += 1
        return 0

    cell_size = 3
    obs_features = input_dict["obs"]
    last_layer = add_time_dimension(obs_features, self.seq_lens)

    # LSTM cell and its zero initial state.
    lstm = tf.nn.rnn_cell.BasicLSTMCell(cell_size, state_is_tuple=True)
    c_size = lstm.state_size.c
    h_size = lstm.state_size.h
    self.state_init = [
        np.zeros(c_size, np.float32),
        np.zeros(h_size, np.float32),
    ]

    # LSTM inputs: reuse provided state tensors, else create placeholders.
    if self.state_in:
        c_in, h_in = self.state_in
    else:
        c_in = tf.placeholder(tf.float32, [None, c_size], name="c")
        h_in = tf.placeholder(tf.float32, [None, h_size], name="h")
        self.state_in = [c_in, h_in]

    # LSTM outputs.
    initial_state = tf.nn.rnn_cell.LSTMStateTuple(c_in, h_in)
    lstm_out, lstm_state = tf.nn.dynamic_rnn(
        lstm,
        last_layer,
        initial_state=initial_state,
        sequence_length=self.seq_lens,
        time_major=False,
        dtype=tf.float32)
    self.state_out = list(lstm_state)

    spy_inputs = [
        last_layer,
        self.state_in,
        self.state_out,
        self.seq_lens,
    ]
    spy_fn = tf.py_func(spy, spy_inputs, tf.int64, stateful=True)

    # Output layers are created under the spy op's control dependency.
    with tf.control_dependencies([spy_fn]):
        last_layer = tf.reshape(lstm_out, [-1, cell_size])
        logits = linear(last_layer, num_outputs, "action",
                        normc_initializer(0.01))
    return logits, last_layer