def testDefaultModels(self): ray.init() with tf.variable_scope("test1"): p1 = ModelCatalog.get_model(np.zeros((10, 3), dtype=np.float32), 5) self.assertEqual(type(p1), FullyConnectedNetwork) with tf.variable_scope("test2"): p2 = ModelCatalog.get_model( np.zeros((10, 84, 84, 3), dtype=np.float32), 5) self.assertEqual(type(p2), VisionNetwork)
def testDefaultModels(self): ray.init() with tf.variable_scope("test1"): p1 = ModelCatalog.get_model( get_registry(), np.zeros((10, 3), dtype=np.float32), 5) self.assertEqual(type(p1), FullyConnectedNetwork) with tf.variable_scope("test2"): p2 = ModelCatalog.get_model( get_registry(), np.zeros((10, 80, 80, 3), dtype=np.float32), 5) self.assertEqual(type(p2), VisionNetwork)
def testDefaultModels(self): ray.init() with tf.variable_scope("test1"): p1 = ModelCatalog.get_model( get_registry(), np.zeros((10, 3), dtype=np.float32), 5) assert type(p1) == FullyConnectedNetwork with tf.variable_scope("test2"): p2 = ModelCatalog.get_model( get_registry(), np.zeros((10, 80, 80, 3), dtype=np.float32), 5) assert type(p2) == VisionNetwork
def testDefaultModels(self): ray.init() with tf.variable_scope("test1"): p1 = ModelCatalog.get_model({ "obs": tf.zeros((10, 3), dtype=tf.float32) }, Box(0, 1, shape=(3, ), dtype=np.float32), 5, {}) self.assertEqual(type(p1), FullyConnectedNetwork) with tf.variable_scope("test2"): p2 = ModelCatalog.get_model({ "obs": tf.zeros((10, 84, 84, 3), dtype=tf.float32) }, Box(0, 1, shape=(84, 84, 3), dtype=np.float32), 5, {}) self.assertEqual(type(p2), VisionNetwork)
def testDefaultModels(self): ray.init() with tf.variable_scope("test1"): p1 = ModelCatalog.get_model( {"obs": tf.zeros((10, 3), dtype=tf.float32)}, Box(0, 1, shape=(3, ), dtype=np.float32), Discrete(5), 5, {}) self.assertEqual(type(p1), FullyConnectedNetwork) with tf.variable_scope("test2"): p2 = ModelCatalog.get_model( {"obs": tf.zeros((10, 84, 84, 3), dtype=tf.float32)}, Box(0, 1, shape=(84, 84, 3), dtype=np.float32), Discrete(5), 5, {}) self.assertEqual(type(p2), VisionNetwork)
def _build_q_network(registry, inputs, num_actions, config):
    dueling = config["dueling"]
    hiddens = config["hiddens"]
    frontend = ModelCatalog.get_model(registry, inputs, 1, config["model"])
    frontend_out = frontend.last_layer

    with tf.variable_scope("action_value"):
        action_out = frontend_out
        for hidden in hiddens:
            action_out = layers.fully_connected(
                action_out, num_outputs=hidden, activation_fn=tf.nn.relu)
        action_scores = layers.fully_connected(
            action_out, num_outputs=num_actions, activation_fn=None)

    if dueling:
        with tf.variable_scope("state_value"):
            state_out = frontend_out
            for hidden in hiddens:
                state_out = layers.fully_connected(
                    state_out, num_outputs=hidden, activation_fn=tf.nn.relu)
            state_score = layers.fully_connected(
                state_out, num_outputs=1, activation_fn=None)
        action_scores_mean = tf.reduce_mean(action_scores, 1)
        action_scores_centered = action_scores - tf.expand_dims(
            action_scores_mean, 1)
        return state_score + action_scores_centered
    else:
        return action_scores
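For reference, here is a minimal, self-contained NumPy sketch (with made-up inputs) of the dueling aggregation computed in the branch above: the advantages are centered by their per-state mean and added to the state value, i.e. Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)).

import numpy as np

def dueling_aggregate(state_score, action_scores):
    # Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)); subtracting the mean
    # advantage keeps V(s) identifiable, as in the dueling branch above.
    action_scores_mean = action_scores.mean(axis=1, keepdims=True)
    return state_score + (action_scores - action_scores_mean)

state_score = np.array([[1.0], [2.0]])               # V(s), [batch, 1]
action_scores = np.array([[0.5, 1.5], [3.0, 1.0]])   # A(s, a), [batch, n]
print(dueling_aggregate(state_score, action_scores))
# [[0.5 1.5]
#  [3.  1. ]]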
def _build_value_network(self, obs, obs_space):
    value_model = ModelCatalog.get_model({
        "obs": obs,
        "is_training": self._get_is_training_placeholder(),
    }, obs_space, 1, self.config["model"])
    return value_model.outputs
def __init__(self, registry, sess, action_space, preprocessor,
             observation_filter):
    self.sess = sess
    self.action_space = action_space
    self.preprocessor = preprocessor
    self.observation_filter = get_filter(
        observation_filter, self.preprocessor.shape)
    self.inputs = tf.placeholder(
        tf.float32, [None] + list(self.preprocessor.shape))

    # Policy network.
    dist_class, dist_dim = ModelCatalog.get_action_dist(
        self.action_space, dist_type="deterministic")
    model = ModelCatalog.get_model(
        registry, self.inputs, dist_dim,
        options={"fcnet_hiddens": [32, 32]})
    dist = dist_class(model.outputs)
    self.sampler = dist.sample()

    self.variables = ray.experimental.TensorFlowVariables(
        model.outputs, self.sess)

    self.num_params = sum([
        np.prod(variable.shape.as_list())
        for _, variable in self.variables.variables.items()
    ])
    self.sess.run(tf.global_variables_initializer())
def _build_critic_network(
    self,
    obs_n,
    act_n,
    obs_space_n,
    act_space_n,
    use_state_preprocessor,
    hiddens,
    activation=None,
    scope=None,
):
    """Build the critic network.

    Args:
        obs_n: list of observation placeholders; contains at least one.
        act_n: list of action placeholders; contains at least one.
        obs_space_n: list of observation spaces; contains at least one.
        act_space_n: list of action spaces; contains at least one.
        use_state_preprocessor: bool; if True, build one preprocessor
            model per observation placeholder, otherwise none.
        hiddens: list of hidden-layer unit counts.
        activation: tf.nn activation function; defaults to None.
        scope: str, name of the variable scope.

    Returns:
        out: tf.Tensor, the logits output.
        feature: tf.Tensor, the input to the logits layer.
        model_n: list of preprocessor models for the observation inputs.
        variables: list of global variables of this critic network.
    """
    with tf1.variable_scope(scope, reuse=tf1.AUTO_REUSE) as scope:
        if use_state_preprocessor:
            model_n = [
                ModelCatalog.get_model(
                    {
                        "obs": obs,
                        "is_training": self._get_is_training_placeholder(),
                    },
                    obs_space,
                    act_space,
                    1,
                    self.config["model"],
                )
                for obs, obs_space, act_space in zip(
                    obs_n, obs_space_n, act_space_n
                )
            ]
            out_n = [model.last_layer for model in model_n]
            out = tf.concat(out_n + act_n, axis=1)
        else:
            model_n = [None] * len(obs_n)
            out = tf.concat(obs_n + act_n, axis=1)

        for hidden in hiddens:
            out = tf1.layers.dense(out, units=hidden, activation=activation)
        feature = out
        out = tf1.layers.dense(feature, units=1, activation=None)

    return out, feature, model_n, tf1.global_variables(scope.name)
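A minimal NumPy analogue (hypothetical shapes, randomly initialized weights) of the fallback path above (use_state_preprocessor=False): all observations and actions are concatenated along the feature axis and fed through a dense stack ending in a scalar Q output.

import numpy as np

def critic_mlp_sketch(obs_n, act_n, hiddens, seed=0):
    rng = np.random.default_rng(seed)
    # Mirrors tf.concat(obs_n + act_n, axis=1) in the builder above.
    out = np.concatenate(obs_n + act_n, axis=1)
    for hidden in hiddens:
        w = rng.standard_normal((out.shape[1], hidden)) * 0.1
        out = np.maximum(out @ w, 0.0)  # dense layer with ReLU
    w = rng.standard_normal((out.shape[1], 1)) * 0.1
    return out @ w  # one Q-value per batch row

obs_n = [np.ones((4, 3)), np.ones((4, 3))]  # two agents, 3-dim obs each
act_n = [np.ones((4, 2)), np.ones((4, 2))]  # two agents, 2-dim actions
print(critic_mlp_sketch(obs_n, act_n, hiddens=[64, 64]).shape)  # (4, 1)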
def _build_q_network(inputs, num_actions, config):
    dueling = config["dueling"]
    hiddens = config["hiddens"]
    frontend = ModelCatalog.get_model(inputs, 1, config["model"])
    frontend_out = frontend.last_layer

    with tf.variable_scope("action_value"):
        action_out = frontend_out
        for hidden in hiddens:
            action_out = layers.fully_connected(
                action_out, num_outputs=hidden, activation_fn=tf.nn.relu)
        action_scores = layers.fully_connected(
            action_out, num_outputs=num_actions, activation_fn=None)

    if dueling:
        with tf.variable_scope("state_value"):
            state_out = frontend_out
            for hidden in hiddens:
                state_out = layers.fully_connected(
                    state_out, num_outputs=hidden, activation_fn=tf.nn.relu)
            state_score = layers.fully_connected(
                state_out, num_outputs=1, activation_fn=None)
        action_scores_mean = tf.reduce_mean(action_scores, 1)
        action_scores_centered = action_scores - tf.expand_dims(
            action_scores_mean, 1)
        return state_score + action_scores_centered
    else:
        return action_scores
def _build_critic_network(self,
                          obs_n,
                          act_n,
                          obs_space_n,
                          act_space_n,
                          use_state_preprocessor,
                          hiddens,
                          activation=None,
                          scope=None):
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE) as scope:
        if use_state_preprocessor:
            model_n = [
                ModelCatalog.get_model({
                    "obs": obs,
                    "is_training": self._get_is_training_placeholder(),
                }, obs_space, act_space, 1, self.config["model"])
                for obs, obs_space, act_space in zip(
                    obs_n, obs_space_n, act_space_n)
            ]
            out_n = [model.last_layer for model in model_n]
            out = tf.concat(out_n + act_n, axis=1)
        else:
            model_n = [None] * len(obs_n)
            out = tf.concat(obs_n + act_n, axis=1)

        for hidden in hiddens:
            out = tf.layers.dense(out, units=hidden, activation=activation)
        feature = out
        out = tf.layers.dense(feature, units=1, activation=None)

    return out, feature, model_n, tf.global_variables(scope.name)
def __init__(self, obs_space, action_space, config):
    self.action_space = action_space
    self.action_noise_std = config["action_noise_std"]
    self.preprocessor = ModelCatalog.get_preprocessor_for_space(obs_space)
    self.observation_filter = get_filter(config["observation_filter"],
                                         self.preprocessor.shape)
    self.single_threaded = config.get("single_threaded", False)
    self.sess = make_session(single_threaded=self.single_threaded)
    self.inputs = tf.placeholder(tf.float32,
                                 [None] + list(self.preprocessor.shape))

    # Policy network.
    dist_class, dist_dim = ModelCatalog.get_action_dist(
        self.action_space, config["model"], dist_type="deterministic")
    model = ModelCatalog.get_model({SampleBatch.CUR_OBS: self.inputs},
                                   obs_space, action_space, dist_dim,
                                   config["model"])
    dist = dist_class(model.outputs, model)
    self.sampler = dist.sample()

    self.variables = ray.experimental.tf_utils.TensorFlowVariables(
        model.outputs, self.sess)

    self.num_params = sum(
        np.prod(variable.shape.as_list())
        for _, variable in self.variables.variables.items())
    self.sess.run(tf.global_variables_initializer())
def _build_q_network(self, obs, obs_space, actions):
    return QNetwork(
        ModelCatalog.get_model({
            "obs": obs
        }, obs_space, 1, self.config["model"]), actions,
        self.config["critic_hiddens"],
        self.config["critic_hidden_activation"]).value
def __init__(self, observation_space, action_space, observations,
             advantages, actions, prev_logits, logit_dim, kl_coeff,
             distribution_class, config, sess):
    assert (isinstance(action_space, gym.spaces.Discrete)
            or isinstance(action_space, gym.spaces.Box))
    self.prev_dist = distribution_class(prev_logits)

    # Saved so that we can compute actions given different observations.
    self.observations = observations

    self.curr_logits = ModelCatalog.get_model(observations, logit_dim,
                                              config["model"]).outputs
    self.curr_dist = distribution_class(self.curr_logits)
    self.sampler = self.curr_dist.sample()

    # Make loss functions.
    self.ratio = tf.exp(
        self.curr_dist.logp(actions) - self.prev_dist.logp(actions))
    self.kl = self.prev_dist.kl(self.curr_dist)
    self.mean_kl = tf.reduce_mean(self.kl)
    self.entropy = self.curr_dist.entropy()
    self.mean_entropy = tf.reduce_mean(self.entropy)
    self.surr1 = self.ratio * advantages
    self.surr2 = tf.clip_by_value(self.ratio, 1 - config["clip_param"],
                                  1 + config["clip_param"]) * advantages
    self.surr = tf.minimum(self.surr1, self.surr2)
    self.loss = tf.reduce_mean(-self.surr + kl_coeff * self.kl -
                               config["entropy_coeff"] * self.entropy)
    self.sess = sess
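The surrogate built above is PPO's clipped objective; a small standalone NumPy check (illustrative numbers only) of the surr1/surr2/minimum computation:

import numpy as np

def clipped_surrogate(logp_new, logp_old, advantages, clip_param=0.3):
    # ratio = pi_new(a|s) / pi_old(a|s); clipping bounds the policy update,
    # and the elementwise minimum takes the pessimistic of the two terms.
    ratio = np.exp(logp_new - logp_old)
    surr1 = ratio * advantages
    surr2 = np.clip(ratio, 1 - clip_param, 1 + clip_param) * advantages
    return np.minimum(surr1, surr2)

print(clipped_surrogate(
    logp_new=np.array([0.2, -0.5]),
    logp_old=np.array([0.0, 0.0]),
    advantages=np.array([1.0, -1.0])))
# [ 1.22140276 -0.7       ]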
def __init__(self, sess, action_space, preprocessor, observation_filter,
             action_noise_std):
    self.sess = sess
    self.action_space = action_space
    self.action_noise_std = action_noise_std
    self.preprocessor = preprocessor
    if observation_filter == "MeanStdFilter":
        self.observation_filter = MeanStdFilter(
            self.preprocessor.shape, clip=None)
    elif observation_filter == "NoFilter":
        self.observation_filter = NoFilter()
    else:
        raise Exception(
            "Unknown observation_filter: " + str(observation_filter))
    self.inputs = tf.placeholder(tf.float32,
                                 [None] + list(self.preprocessor.shape))

    # Policy network.
    dist_class, dist_dim = ModelCatalog.get_action_dist(
        self.action_space, dist_type="deterministic")
    model = ModelCatalog.get_model(self.inputs, dist_dim)
    dist = dist_class(model.outputs)
    self.sampler = dist.sample()

    self.variables = ray.experimental.TensorFlowVariables(
        model.outputs, self.sess)

    self.num_params = sum([
        np.prod(variable.shape.as_list())
        for _, variable in self.variables.variables.items()
    ])
    self.sess.run(tf.global_variables_initializer())
def __init__(self, sess, action_space, obs_space, preprocessor, observation_filter, model_config, action_noise_std=0.0): self.sess = sess self.action_space = action_space self.action_noise_std = action_noise_std self.preprocessor = preprocessor self.observation_filter = get_filter(observation_filter, self.preprocessor.shape) self.inputs = tf.placeholder(tf.float32, [None] + list(self.preprocessor.shape)) # Policy network. dist_class, dist_dim = ModelCatalog.get_action_dist( action_space, model_config, dist_type="deterministic") model = ModelCatalog.get_model({ "obs": self.inputs }, obs_space, dist_dim, model_config) dist = dist_class(model.outputs) self.sampler = dist.sample() self.variables = ray.experimental.tf_utils.TensorFlowVariables( model.outputs, self.sess) self.num_params = sum( np.prod(variable.shape.as_list()) for _, variable in self.variables.variables.items()) self.sess.run(tf.global_variables_initializer())
def __init__(self, sess, action_space, obs_space, preprocessor, observation_filter, model_config, action_noise_std=0.0): self.sess = sess self.action_space = action_space self.action_noise_std = action_noise_std self.preprocessor = preprocessor self.observation_filter = get_filter(observation_filter, self.preprocessor.shape) self.inputs = tf.placeholder(tf.float32, [None] + list(self.preprocessor.shape)) # Policy network. dist_class, dist_dim = ModelCatalog.get_action_dist( action_space, model_config, dist_type="deterministic") model = ModelCatalog.get_model({ "obs": self.inputs }, obs_space, action_space, dist_dim, model_config) dist = dist_class(model.outputs) self.sampler = dist.sample() self.variables = ray.experimental.tf_utils.TensorFlowVariables( model.outputs, self.sess) self.num_params = sum( np.prod(variable.shape.as_list()) for _, variable in self.variables.variables.items()) self.sess.run(tf.global_variables_initializer())
def _build_q_network(self, obs):
    qnet = QNetwork(
        ModelCatalog.get_model(obs, 1, self.config["model"]),
        self.num_actions, self.config["dueling"], self.config["hiddens"],
        self.config["noisy"], self.config["num_atoms"],
        self.config["v_min"], self.config["v_max"], self.config["sigma0"])
    return qnet.value, qnet.logits, qnet.dist
def _build_policy_network(self, obs, obs_space, action_space):
    if self.config["use_state_preprocessor"]:
        model = ModelCatalog.get_model({
            "obs": obs,
            "is_training": self._get_is_training_placeholder(),
        }, obs_space, action_space, 1, self.config["model"])
        action_out = model.last_layer
    else:
        model = None
        action_out = obs

    activation = getattr(tf.nn, self.config["actor_hidden_activation"])
    for hidden in self.config["actor_hiddens"]:
        action_out = tf.layers.dense(
            action_out, units=hidden, activation=activation)
        if self.config["parameter_noise"]:
            action_out = tf.keras.layers.LayerNormalization()(action_out)
    action_out = tf.layers.dense(
        action_out, units=action_space.shape[0], activation=None)

    # Use sigmoid to scale to [0, 1], but also double the magnitude of the
    # input to emulate the behaviour of the tanh activation used in the
    # DDPG and TD3 papers.
    sigmoid_out = tf.nn.sigmoid(2 * action_out)
    # Rescale to the actual env policy scale (sigmoid_out has shape
    # [batch_size, dim_actions], so we reshape to get the same dims).
    action_range = (action_space.high - action_space.low)[None]
    low_action = action_space.low[None]
    actions = action_range * sigmoid_out + low_action

    return actions, model
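The sigmoid rescaling at the end of this builder can be checked in isolation. A short NumPy sketch (the bounds are made-up examples); note that sigmoid(2x) equals (tanh(x) + 1) / 2, so after rescaling this reproduces a tanh squashing:

import numpy as np

def squash_to_action_space(action_out, low, high):
    # sigmoid(2x) = (tanh(x) + 1) / 2, which stays in (0, 1); that interval
    # is then rescaled to [low, high] per action dimension.
    sigmoid_out = 1.0 / (1.0 + np.exp(-2.0 * action_out))
    return (high - low)[None] * sigmoid_out + low[None]

low = np.array([-2.0, 0.0])
high = np.array([2.0, 1.0])
raw = np.array([[0.0, 10.0]])  # unbounded network output
print(squash_to_action_space(raw, low, high))
# approximately [[0. 1.]]: 0 maps to the midpoint, large inputs saturate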
def testCustomModel(self): ray.init() ModelCatalog.register_custom_model("foo", CustomModel) p1 = ModelCatalog.get_model({"obs": tf.constant([1, 2, 3])}, Box(0, 1, shape=(3, ), dtype=np.float32), Discrete(5), 5, {"custom_model": "foo"}) self.assertEqual(str(type(p1)), str(CustomModel))
def _initialize(self, ob_space, ac_space, preprocessor, ac_noise_std):
    self.ac_space = ac_space
    self.ac_noise_std = ac_noise_std
    self.preprocessor_shape = preprocessor.transform_shape(ob_space.shape)

    with tf.variable_scope(type(self).__name__) as scope:
        # Observation normalization.
        ob_mean = tf.get_variable(
            'ob_mean', self.preprocessor_shape, tf.float32,
            tf.constant_initializer(np.nan), trainable=False)
        ob_std = tf.get_variable(
            'ob_std', self.preprocessor_shape, tf.float32,
            tf.constant_initializer(np.nan), trainable=False)
        in_mean = tf.placeholder(tf.float32, self.preprocessor_shape)
        in_std = tf.placeholder(tf.float32, self.preprocessor_shape)
        self._set_ob_mean_std = U.function(
            [in_mean, in_std], [],
            updates=[
                tf.assign(ob_mean, in_mean),
                tf.assign(ob_std, in_std),
            ])

        inputs = tf.placeholder(
            tf.float32, [None] + list(self.preprocessor_shape))

        # TODO(ekl): we should do clipping in a standard RLlib preprocessor
        clipped_inputs = tf.clip_by_value(
            (inputs - ob_mean) / ob_std, -5.0, 5.0)

        # Policy network.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
            self.ac_space, dist_type='deterministic')
        model = ModelCatalog.get_model(clipped_inputs, dist_dim)
        dist = dist_class(model.outputs)

        self._act = U.function([inputs], dist.sample())
    return scope
def _build_actor_network(self,
                         obs,
                         obs_space,
                         act_space,
                         use_state_preprocessor,
                         hiddens,
                         activation=None,
                         scope=None):
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE) as scope:
        if use_state_preprocessor:
            model = ModelCatalog.get_model({
                "obs": obs,
                "is_training": self._get_is_training_placeholder(),
            }, obs_space, act_space, 1, self.config["model"])
            out = model.last_layer
        else:
            model = None
            out = obs

        for hidden in hiddens:
            out = tf.layers.dense(out, units=hidden, activation=activation)
        feature = tf.layers.dense(
            out, units=act_space.shape[0], activation=None)
        sampler = tfp.distributions.RelaxedOneHotCategorical(
            temperature=1.0, logits=feature).sample()

    return sampler, feature, model, tf.global_variables(scope.name)
def _build_p_network(self, obs, obs_space): return PNetwork( ModelCatalog.get_model({ "obs": obs }, obs_space, 1, self.config["model"]), self.dim_actions, self.config["actor_hiddens"], self.config["actor_hidden_activation"]).action_scores
def _build_p_network(self, obs, obs_space): return PNetwork( ModelCatalog.get_model({ "obs": obs, "is_training": self._get_is_training_placeholder(), }, obs_space, 1, self.config["model"]), self.dim_actions, self.config["actor_hiddens"], self.config["actor_hidden_activation"]).action_scores
def testCustomModel(self): ray.init() ModelCatalog.register_custom_model("foo", CustomModel) p1 = ModelCatalog.get_model({ "obs": tf.constant([1, 2, 3]) }, Box(0, 1, shape=(3, ), dtype=np.float32), Discrete(5), 5, {"custom_model": "foo"}) self.assertEqual(str(type(p1)), str(CustomModel))
def _build_q_network(self, obs, obs_space, action_space, actions):
    q_net = QNetwork(
        ModelCatalog.get_model({
            "obs": obs,
            "is_training": self._get_is_training_placeholder(),
        }, obs_space, action_space, 1, self.config["model"]), actions,
        self.config["critic_hiddens"],
        self.config["critic_hidden_activation"])
    return q_net.value, q_net.model
def _build_q_network(self, obs, obs_space, actions):
    q_net = QNetwork(
        ModelCatalog.get_model({
            "obs": obs,
            "is_training": self._get_is_training_placeholder(),
        }, obs_space, 1, self.config["model"]), actions,
        self.config["critic_hiddens"],
        self.config["critic_hidden_activation"])
    return q_net.value, q_net.model
def _build_p_network(self, obs, obs_space):
    policy_net = PNetwork(
        ModelCatalog.get_model({
            "obs": obs,
            "is_training": self._get_is_training_placeholder(),
        }, obs_space, 1, self.config["model"]), self.dim_actions,
        self.config["actor_hiddens"],
        self.config["actor_hidden_activation"])
    return policy_net.action_scores, policy_net.model
def _build_q_network(self, obs, space):
    qnet = QNetwork(
        ModelCatalog.get_model({
            "obs": obs,
            "is_training": self._get_is_training_placeholder(),
        }, space, self.num_actions, self.config["model"]),
        self.num_actions, self.config["dueling"], self.config["hiddens"],
        self.config["noisy"], self.config["num_atoms"],
        self.config["v_min"], self.config["v_max"], self.config["sigma0"])
    return qnet.value, qnet.logits, qnet.dist, qnet.model
def _build_p_network(self, obs, obs_space):
    policy_net = PNetwork(
        ModelCatalog.get_model({
            "obs": obs,
            "is_training": self._get_is_training_placeholder(),
        }, obs_space, 1, self.config["model"]), self.dim_actions,
        self.config["actor_hiddens"],
        self.config["actor_hidden_activation"],
        self.config["parameter_noise"])
    return policy_net.action_scores, policy_net.model
def _build_q_network(policy, obs, obs_space, action_space):
    config = policy.config
    qnet = QNetwork(
        ModelCatalog.get_model({
            "obs": obs,
            "is_training": policy._get_is_training_placeholder(),
        }, obs_space, action_space, action_space.n, config["model"]),
        action_space.n, config["dueling"], config["hiddens"],
        config["noisy"], config["num_atoms"], config["v_min"],
        config["v_max"], config["sigma0"], config["parameter_noise"])
    return qnet.value, qnet.logits, qnet.dist, qnet.model
def _build_q_network(inputs, action_inputs, config):
    frontend = ModelCatalog.get_model(inputs, 1, config["model"])
    hiddens = config["critic_hiddens"]

    q_out = tf.concat([frontend.last_layer, action_inputs], axis=1)
    for hidden in hiddens:
        q_out = layers.fully_connected(
            q_out, num_outputs=hidden, activation_fn=tf.nn.relu)
    q_scores = layers.fully_connected(
        q_out, num_outputs=1, activation_fn=None)
    return q_scores
def _build_q_network(self, obs, obs_space, action_space, actions):
    if self.config["use_state_preprocessor"]:
        q_model = ModelCatalog.get_model({
            "obs": obs,
            "is_training": self._get_is_training_placeholder(),
        }, obs_space, action_space, 1, self.config["model"])
        q_out = tf.concat([q_model.last_layer, actions], axis=1)
    else:
        q_model = None
        q_out = tf.concat([obs, actions], axis=1)

    activation = getattr(tf.nn, self.config["critic_hidden_activation"])
    for hidden in self.config["critic_hiddens"]:
        q_out = tf.layers.dense(q_out, units=hidden, activation=activation)
    q_values = tf.layers.dense(q_out, units=1, activation=None)

    return q_values, q_model
def _build_p_network(inputs, dim_actions, config): """ map an observation (i.e., state) to an action where each entry takes value from (0, 1) due to the sigmoid function """ frontend = ModelCatalog.get_model(inputs, 1, config["model"]) hiddens = config["actor_hiddens"] action_out = frontend.last_layer for hidden in hiddens: action_out = layers.fully_connected(action_out, num_outputs=hidden, activation_fn=tf.nn.relu) # Use sigmoid layer to bound values within (0, 1) # shape of action_scores is [batch_size, dim_actions] action_scores = layers.fully_connected(action_out, num_outputs=dim_actions, activation_fn=tf.nn.sigmoid) return action_scores
def _build_actor_network(self,
                         obs,
                         obs_space,
                         act_space,
                         use_state_preprocessor,
                         hiddens,
                         activation=None,
                         scope=None):
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE) as scope:
        if use_state_preprocessor:
            model = ModelCatalog.get_model({
                "obs": obs,
                "is_training": self._get_is_training_placeholder(),
            }, obs_space, act_space, 1, self.config["model"])
            out = model.last_layer
        else:
            model = None
            out = obs

        for hidden in hiddens:
            out = tf.layers.dense(out, units=hidden, activation=activation)
        feature = tf.layers.dense(
            out, units=act_space.shape[0], activation=None)
        # sampler = tfp.distributions.RelaxedOneHotCategorical(
        #     temperature=1.0, logits=feature).sample()

        # Use sigmoid to scale to [0, 1], but also double the magnitude of
        # the input to emulate the behaviour of the tanh activation used in
        # the DDPG and TD3 papers.
        sigmoid_out = tf.nn.sigmoid(2 * feature)
        # Rescale to the actual env policy scale (sigmoid_out has shape
        # [batch_size, dim_actions], so we reshape to get the same dims).
        action_range = (act_space.high - act_space.low)[None]
        low_action = act_space.low[None]
        actions = action_range * sigmoid_out + low_action

    return actions, feature, model, tf.global_variables(scope.name)
def __init__(self,
             sess,
             action_space,
             preprocessor,
             observation_filter,
             action_noise_std,
             options={}):
    if len(preprocessor.shape) > 1:
        raise UnsupportedSpaceException(
            "Observation space {} is not supported with ARS.".format(
                preprocessor.shape))

    self.sess = sess
    self.action_space = action_space
    self.action_noise_std = action_noise_std
    self.preprocessor = preprocessor
    self.observation_filter = get_filter(observation_filter,
                                         self.preprocessor.shape)
    self.inputs = tf.placeholder(tf.float32,
                                 [None] + list(self.preprocessor.shape))

    # Policy network.
    dist_class, dist_dim = ModelCatalog.get_action_dist(
        action_space, dist_type="deterministic")
    model = ModelCatalog.get_model(self.inputs, dist_dim, options=options)
    dist = dist_class(model.outputs)
    self.sampler = dist.sample()

    self.variables = ray.experimental.TensorFlowVariables(
        model.outputs, self.sess)

    self.num_params = sum(
        np.prod(variable.shape.as_list())
        for _, variable in self.variables.variables.items())
    self.sess.run(tf.global_variables_initializer())
def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, **config) self.config = config dist_cls, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) # Action inputs self.obs_t = tf.placeholder(tf.float32, shape=(None, ) + observation_space.shape) prev_actions_ph = ModelCatalog.get_action_placeholder(action_space) prev_rewards_ph = tf.placeholder(tf.float32, [None], name="prev_reward") with tf.variable_scope(POLICY_SCOPE) as scope: self.model = ModelCatalog.get_model( { "obs": self.obs_t, "prev_actions": prev_actions_ph, "prev_rewards": prev_rewards_ph, "is_training": self._get_is_training_placeholder(), }, observation_space, action_space, logit_dim, self.config["model"]) logits = self.model.outputs self.p_func_vars = scope_vars(scope.name) # Action outputs action_dist = dist_cls(logits) self.output_actions = action_dist.sample() # Training inputs self.act_t = tf.placeholder(tf.int32, [None], name="action") self.cum_rew_t = tf.placeholder(tf.float32, [None], name="reward") # v network evaluation with tf.variable_scope(VALUE_SCOPE) as scope: state_values = self.model.value_function() self.v_func_vars = scope_vars(scope.name) self.v_loss = self._build_value_loss(state_values, self.cum_rew_t) self.p_loss = self._build_policy_loss(state_values, self.cum_rew_t, logits, self.act_t, action_space) # which kind of objective to optimize objective = (self.p_loss.loss + self.config["vf_coeff"] * self.v_loss.loss) self.explained_variance = tf.reduce_mean( explained_variance(self.cum_rew_t, state_values)) # initialize TFPolicy self.sess = tf.get_default_session() self.loss_inputs = [ (SampleBatch.CUR_OBS, self.obs_t), (SampleBatch.ACTIONS, self.act_t), (Postprocessing.ADVANTAGES, self.cum_rew_t), ] TFPolicy.__init__(self, observation_space, action_space, self.sess, obs_input=self.obs_t, action_sampler=self.output_actions, action_prob=action_dist.sampled_action_prob(), loss=objective, model=self.model, loss_inputs=self.loss_inputs, state_inputs=self.model.state_in, state_outputs=self.model.state_out, prev_action_input=prev_actions_ph, prev_reward_input=prev_rewards_ph) self.sess.run(tf.global_variables_initializer()) self.stats_fetches = { "total_loss": objective, "vf_explained_var": self.explained_variance, "policy_loss": self.p_loss.loss, "vf_loss": self.v_loss.loss }
def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, **config) self.config = config dist_cls, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) # Action inputs self.obs_t = tf.placeholder( tf.float32, shape=(None, ) + observation_space.shape) prev_actions_ph = ModelCatalog.get_action_placeholder(action_space) prev_rewards_ph = tf.placeholder( tf.float32, [None], name="prev_reward") with tf.variable_scope(P_SCOPE) as scope: self.model = ModelCatalog.get_model({ "obs": self.obs_t, "prev_actions": prev_actions_ph, "prev_rewards": prev_rewards_ph, "is_training": self._get_is_training_placeholder(), }, observation_space, action_space, logit_dim, self.config["model"]) logits = self.model.outputs self.p_func_vars = _scope_vars(scope.name) # Action outputs action_dist = dist_cls(logits) self.output_actions = action_dist.sample() # Training inputs self.act_t = tf.placeholder(tf.int32, [None], name="action") self.cum_rew_t = tf.placeholder(tf.float32, [None], name="reward") # v network evaluation with tf.variable_scope(V_SCOPE) as scope: state_values = self.model.value_function() self.v_func_vars = _scope_vars(scope.name) self.v_loss = self._build_value_loss(state_values, self.cum_rew_t) self.p_loss = self._build_policy_loss(state_values, self.cum_rew_t, logits, self.act_t, action_space) # which kind of objective to optimize objective = ( self.p_loss.loss + self.config["vf_coeff"] * self.v_loss.loss) self.explained_variance = tf.reduce_mean( explained_variance(self.cum_rew_t, state_values)) # initialize TFPolicyGraph self.sess = tf.get_default_session() self.loss_inputs = [ ("obs", self.obs_t), ("actions", self.act_t), ("advantages", self.cum_rew_t), ] TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=self.obs_t, action_sampler=self.output_actions, action_prob=action_dist.sampled_action_prob(), loss=objective, model=self.model, loss_inputs=self.loss_inputs, state_inputs=self.model.state_in, state_outputs=self.model.state_out, prev_action_input=prev_actions_ph, prev_reward_input=prev_rewards_ph) self.sess.run(tf.global_variables_initializer()) self.stats_fetches = { "total_loss": objective, "vf_explained_var": self.explained_variance, "policy_loss": self.p_loss.loss, "vf_loss": self.v_loss.loss }
def __init__(self, observation_space, action_space, observations,
             value_targets, advantages, actions, prev_logits,
             prev_vf_preds, logit_dim, kl_coeff, distribution_class,
             config, sess, registry):
    self.prev_dist = distribution_class(prev_logits)

    # Saved so that we can compute actions given different observations.
    self.observations = observations

    self.curr_logits = ModelCatalog.get_model(
        registry, observations, logit_dim, config["model"]).outputs
    self.curr_dist = distribution_class(self.curr_logits)
    self.sampler = self.curr_dist.sample()

    if config["use_gae"]:
        vf_config = config["model"].copy()
        # Do not split the last layer of the value function into mean
        # parameters and standard deviation parameters, and do not make
        # the standard deviations free variables.
        vf_config["free_log_std"] = False
        with tf.variable_scope("value_function"):
            self.value_function = ModelCatalog.get_model(
                registry, observations, 1, vf_config).outputs
        self.value_function = tf.reshape(self.value_function, [-1])

    # Make loss functions.
    self.ratio = tf.exp(
        self.curr_dist.logp(actions) - self.prev_dist.logp(actions))
    self.kl = self.prev_dist.kl(self.curr_dist)
    self.mean_kl = tf.reduce_mean(self.kl)
    self.entropy = self.curr_dist.entropy()
    self.mean_entropy = tf.reduce_mean(self.entropy)
    self.surr1 = self.ratio * advantages
    self.surr2 = tf.clip_by_value(self.ratio, 1 - config["clip_param"],
                                  1 + config["clip_param"]) * advantages
    self.surr = tf.minimum(self.surr1, self.surr2)
    self.mean_policy_loss = tf.reduce_mean(-self.surr)

    if config["use_gae"]:
        # We use a clipped squared loss here to be more robust against
        # outliers, which seem to occur when the rollouts get longer (the
        # variance scales superlinearly with the length of the rollout).
        self.vf_loss1 = tf.square(self.value_function - value_targets)
        vf_clipped = prev_vf_preds + tf.clip_by_value(
            self.value_function - prev_vf_preds, -config["clip_param"],
            config["clip_param"])
        self.vf_loss2 = tf.square(vf_clipped - value_targets)
        self.vf_loss = tf.minimum(self.vf_loss1, self.vf_loss2)
        self.mean_vf_loss = tf.reduce_mean(self.vf_loss)
        self.loss = tf.reduce_mean(
            -self.surr + kl_coeff * self.kl +
            config["vf_loss_coeff"] * self.vf_loss -
            config["entropy_coeff"] * self.entropy)
    else:
        self.mean_vf_loss = tf.constant(0.0)
        self.loss = tf.reduce_mean(
            -self.surr + kl_coeff * self.kl -
            config["entropy_coeff"] * self.entropy)

    self.sess = sess

    if config["use_gae"]:
        self.policy_results = [
            self.sampler, self.curr_logits, self.value_function]
    else:
        self.policy_results = [
            self.sampler, self.curr_logits, tf.constant("NA")]
def testCustomModel(self): ray.init() ModelCatalog.register_custom_model("foo", CustomModel) p1 = ModelCatalog.get_model( get_registry(), 1, 5, {"custom_model": "foo"}) self.assertEqual(str(type(p1)), str(CustomModel))