Example #1
    def __init__(self, obs_space, action_space, registry, config):
        self.config = config

        # setup policy
        self.x = tf.placeholder(tf.float32, shape=[None]+list(obs_space.shape))
        dist_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])
        self.model = ModelCatalog.get_model(
            registry, self.x, self.logit_dim, options=self.config["model"])
        self.dist = dist_class(self.model.outputs)  # logit for each action

        # setup policy loss
        self.ac = ModelCatalog.get_action_placeholder(action_space)
        self.adv = tf.placeholder(tf.float32, [None], name="adv")
        self.loss = -tf.reduce_mean(self.dist.logp(self.ac) * self.adv)

        # initialize TFPolicyGraph
        self.sess = tf.get_default_session()
        self.loss_in = [
            ("obs", self.x),
            ("actions", self.ac),
            ("advantages", self.adv),
        ]
        self.is_training = tf.placeholder_with_default(True, ())
        TFPolicyGraph.__init__(
            self, self.sess, obs_input=self.x,
            action_sampler=self.dist.sample(), loss=self.loss,
            loss_inputs=self.loss_in, is_training=self.is_training)
        self.sess.run(tf.global_variables_initializer())
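The policy loss assembled in the "setup policy loss" block is the plain REINFORCE objective, -E[log pi(a|s) * advantage]. Below is a minimal standalone sketch of that loss for a discrete action space; the names and shapes are illustrative, not RLlib's own.

import numpy as np
import tensorflow as tf

# Stand-in for dist.logp(ac): for a categorical policy, the negative
# log-probability of the taken action is the sparse softmax cross-entropy.
logits = tf.placeholder(tf.float32, [None, 2], name="logits")
actions = tf.placeholder(tf.int64, [None], name="actions")
advantages = tf.placeholder(tf.float32, [None], name="advantages")

neg_logp = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=actions, logits=logits)
# Equivalent to -tf.reduce_mean(logp * advantages) in the example above.
pg_loss = tf.reduce_mean(neg_logp * advantages)

with tf.Session() as sess:
    print(sess.run(pg_loss, feed_dict={
        logits: np.array([[2.0, 0.5], [0.1, 1.2]], dtype=np.float32),
        actions: np.array([0, 1]),
        advantages: np.array([1.0, -0.5], dtype=np.float32),
    }))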
Example #2
    def __init__(self, ob_space, action_space, config):
        self.local_steps = 0
        self.config = config
        self.summarize = config.get("summarize")

        self._setup_graph(ob_space, action_space)
        assert all(
            hasattr(self, attr) for attr in ["vf", "logits", "x", "var_list"])
        print("Setting up loss")
        self.setup_loss(action_space)
        self.is_training = tf.placeholder_with_default(True, ())
        self.sess = tf.get_default_session()

        TFPolicyGraph.__init__(self,
                               self.sess,
                               obs_input=self.x,
                               action_sampler=self.action_dist.sample(),
                               loss=self.loss,
                               loss_inputs=self.loss_in,
                               is_training=self.is_training,
                               state_inputs=self.state_in,
                               state_outputs=self.state_out)

        self.sess.run(tf.global_variables_initializer())

        if self.summarize:
            bs = tf.to_float(tf.shape(self.x)[0])
            tf.summary.scalar("model/policy_graph", self.pi_loss / bs)
            tf.summary.scalar("model/value_loss", self.vf_loss / bs)
            tf.summary.scalar("model/entropy", self.entropy / bs)
            tf.summary.scalar("model/grad_gnorm", tf.global_norm(self._grads))
            tf.summary.scalar("model/var_gnorm", tf.global_norm(self.var_list))
            self.summary_op = tf.summary.merge_all()
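When summarize is enabled, the merged self.summary_op still has to be evaluated and written out somewhere outside this constructor. A standalone TF 1.x sketch of that consumption pattern follows; the toy scalar and the /tmp path are placeholders, not part of the example.

import tensorflow as tf

loss = tf.placeholder(tf.float32, (), name="loss")
tf.summary.scalar("model/policy_loss", loss)
summary_op = tf.summary.merge_all()

with tf.Session() as sess:
    # In the policy graph this would be sess.run(self.summary_op, feed_dict=...)
    writer = tf.summary.FileWriter("/tmp/tb_demo", sess.graph)
    for step in range(3):
        summ = sess.run(summary_op, feed_dict={loss: 1.0 / (step + 1)})
        writer.add_summary(summ, global_step=step)
    writer.close()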
Example #3
    def __init__(self, obs_space, action_space, config):
        config = dict(ray.rllib.pg.pg.DEFAULT_CONFIG, **config)
        self.config = config

        # Setup policy
        obs = tf.placeholder(tf.float32, shape=[None] + list(obs_space.shape))
        dist_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])
        self.model = ModelCatalog.get_model(obs,
                                            self.logit_dim,
                                            options=self.config["model"])
        action_dist = dist_class(self.model.outputs)  # logit for each action

        # Setup policy loss
        actions = ModelCatalog.get_action_placeholder(action_space)
        advantages = tf.placeholder(tf.float32, [None], name="adv")
        loss = PGLoss(action_dist, actions, advantages).loss

        # Initialize TFPolicyGraph
        sess = tf.get_default_session()
        loss_in = [
            ("obs", obs),
            ("actions", actions),
            ("advantages", advantages),
        ]

        # LSTM support
        for i, ph in enumerate(self.model.state_in):
            loss_in.append(("state_in_{}".format(i), ph))

        is_training = tf.placeholder_with_default(True, ())
        TFPolicyGraph.__init__(self,
                               obs_space,
                               action_space,
                               sess,
                               obs_input=obs,
                               action_sampler=action_dist.sample(),
                               loss=loss,
                               loss_inputs=loss_in,
                               is_training=is_training,
                               state_inputs=self.model.state_in,
                               state_outputs=self.model.state_out,
                               seq_lens=self.model.seq_lens,
                               max_seq_len=config["model"]["max_seq_len"])
        sess.run(tf.global_variables_initializer())
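The "LSTM support" loop simply extends loss_in with one named placeholder per recurrent state tensor exposed by the model. An illustrative sketch of what it produces for a model with an LSTM cell state and hidden state (the cell size and placeholder shapes are assumptions):

import tensorflow as tf

cell_size = 64
state_in = [
    tf.placeholder(tf.float32, [None, cell_size], name="c_in"),
    tf.placeholder(tf.float32, [None, cell_size], name="h_in"),
]
loss_in = [("obs", tf.placeholder(tf.float32, [None, 4], name="obs"))]
for i, ph in enumerate(state_in):
    loss_in.append(("state_in_{}".format(i), ph))

print([name for name, _ in loss_in])  # ['obs', 'state_in_0', 'state_in_1']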
Example #4
    def __init__(self, observation_space, action_space, config):
        if not isinstance(action_space, Box):
            raise UnsupportedSpaceException(
                "Action space {} is not supported for DDPG.".format(
                    action_space))

        self.config = config
        self.cur_epsilon = 1.0
        dim_actions = action_space.shape[0]
        low_action = action_space.low
        high_action = action_space.high
        self.actor_optimizer = tf.train.AdamOptimizer(
            learning_rate=config["actor_lr"])
        self.critic_optimizer = tf.train.AdamOptimizer(
            learning_rate=config["critic_lr"])

        # Action inputs
        self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
        self.eps = tf.placeholder(tf.float32, (), name="eps")
        self.cur_observations = tf.placeholder(tf.float32,
                                               shape=(None, ) +
                                               observation_space.shape)

        # Actor: P (policy) network
        with tf.variable_scope(P_SCOPE) as scope:
            p_values = _build_p_network(self.cur_observations, dim_actions,
                                        config)
            self.p_func_vars = _scope_vars(scope.name)

        # Action outputs
        with tf.variable_scope(A_SCOPE):
            self.output_actions = _build_action_network(
                p_values, low_action, high_action, self.stochastic, self.eps,
                config["exploration_theta"], config["exploration_sigma"])

        with tf.variable_scope(A_SCOPE, reuse=True):
            exploration_sample = tf.get_variable(name="ornstein_uhlenbeck")
            self.reset_noise_op = tf.assign(exploration_sample,
                                            dim_actions * [.0])

        # Replay inputs
        self.obs_t = tf.placeholder(tf.float32,
                                    shape=(None, ) + observation_space.shape,
                                    name="observation")
        self.act_t = tf.placeholder(tf.float32,
                                    shape=(None, ) + action_space.shape,
                                    name="action")
        self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
        self.obs_tp1 = tf.placeholder(tf.float32,
                                      shape=(None, ) + observation_space.shape)
        self.done_mask = tf.placeholder(tf.float32, [None], name="done")
        self.importance_weights = tf.placeholder(tf.float32, [None],
                                                 name="weight")

        # p network evaluation
        with tf.variable_scope(P_SCOPE, reuse=True) as scope:
            self.p_t = _build_p_network(self.obs_t, dim_actions, config)

        # target p network evaluation
        with tf.variable_scope(P_TARGET_SCOPE) as scope:
            p_tp1 = _build_p_network(self.obs_tp1, dim_actions, config)
            target_p_func_vars = _scope_vars(scope.name)

        # Action outputs
        with tf.variable_scope(A_SCOPE, reuse=True):
            deterministic_flag = tf.constant(value=False, dtype=tf.bool)
            zero_eps = tf.constant(value=.0, dtype=tf.float32)
            output_actions = _build_action_network(self.p_t, low_action,
                                                   high_action,
                                                   deterministic_flag,
                                                   zero_eps,
                                                   config["exploration_theta"],
                                                   config["exploration_sigma"])

            output_actions_estimated = _build_action_network(
                p_tp1, low_action, high_action, deterministic_flag, zero_eps,
                config["exploration_theta"], config["exploration_sigma"])

        # q network evaluation
        with tf.variable_scope(Q_SCOPE) as scope:
            q_t = _build_q_network(self.obs_t, self.act_t, config)
            self.q_func_vars = _scope_vars(scope.name)
        with tf.variable_scope(Q_SCOPE, reuse=True):
            q_tp0 = _build_q_network(self.obs_t, output_actions, config)

        # target q network evaluation
        with tf.variable_scope(Q_TARGET_SCOPE) as scope:
            q_tp1 = _build_q_network(self.obs_tp1, output_actions_estimated,
                                     config)
            target_q_func_vars = _scope_vars(scope.name)

        q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)

        q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
        q_tp1_best_masked = (1.0 - self.done_mask) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = (
            self.rew_t + config["gamma"]**config["n_step"] * q_tp1_best_masked)

        # compute the error (potentially clipped)
        self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        if config.get("use_huber"):
            errors = _huber_loss(self.td_error, config.get("huber_threshold"))
        else:
            errors = 0.5 * tf.square(self.td_error)

        self.loss = tf.reduce_mean(self.importance_weights * errors)

        # for policy gradient
        self.actor_loss = -1.0 * tf.reduce_mean(q_tp0)

        if config["l2_reg"] is not None:
            for var in self.p_func_vars:
                if "bias" not in var.name:
                    self.actor_loss += (config["l2_reg"] * 0.5 *
                                        tf.nn.l2_loss(var))
            for var in self.q_func_vars:
                if "bias" not in var.name:
                    self.loss += config["l2_reg"] * 0.5 * tf.nn.l2_loss(var)

        # update_target_fn will be called periodically to copy Q network to
        # target Q network
        self.tau_value = config.get("tau")
        self.tau = tf.placeholder(tf.float32, (), name="tau")
        update_target_expr = []
        for var, var_target in zip(
                sorted(self.q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(
                var_target.assign(self.tau * var +
                                  (1.0 - self.tau) * var_target))
        for var, var_target in zip(
                sorted(self.p_func_vars, key=lambda v: v.name),
                sorted(target_p_func_vars, key=lambda v: v.name)):
            update_target_expr.append(
                var_target.assign(self.tau * var +
                                  (1.0 - self.tau) * var_target))
        self.update_target_expr = tf.group(*update_target_expr)

        self.sess = tf.get_default_session()
        self.loss_inputs = [
            ("obs", self.obs_t),
            ("actions", self.act_t),
            ("rewards", self.rew_t),
            ("new_obs", self.obs_tp1),
            ("dones", self.done_mask),
            ("weights", self.importance_weights),
        ]
        self.is_training = tf.placeholder_with_default(True, ())
        TFPolicyGraph.__init__(self,
                               self.sess,
                               obs_input=self.cur_observations,
                               action_sampler=self.output_actions,
                               loss=self.loss,
                               loss_inputs=self.loss_inputs,
                               is_training=self.is_training)
        self.sess.run(tf.global_variables_initializer())

        # Note that this encompasses both the policy and Q-value networks and
        # their corresponding target networks
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(q_tp0, q_tp1), self.sess)

        # Hard initial update
        self.update_target(tau=1.0)
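The target-update ops built above blend each target variable toward its online counterpart, target <- tau * online + (1 - tau) * target, so tau=1.0 (the "hard initial update") is a straight copy and the configured tau gives slow Polyak tracking. A standalone illustration with two variables:

import tensorflow as tf

online = tf.get_variable("online_w", initializer=tf.constant([1.0, 2.0]))
target = tf.get_variable("target_w", initializer=tf.constant([0.0, 0.0]))
tau = tf.placeholder(tf.float32, (), name="tau")
update = target.assign(tau * online + (1.0 - tau) * target)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(update, feed_dict={tau: 1.0}))    # hard sync -> [1. 2.]
    sess.run(online.assign([3.0, 4.0]))
    print(sess.run(update, feed_dict={tau: 0.001}))  # slow tracking -> [1.002 2.002]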
Example #5
    def set_state(self, state):
        TFPolicyGraph.set_state(self, state[0])
        self.set_epsilon(state[1])
Example #6
    def get_state(self):
        return [TFPolicyGraph.get_state(self), self.cur_epsilon]
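Taken together, Examples #5 and #6 let the exploration epsilon travel with the TF weights when a policy is checkpointed or shipped between workers. A hedged usage sketch, assuming policy is an instance of a graph class that defines these two methods:

state = policy.get_state()   # [TFPolicyGraph state, cur_epsilon]
# ... train elsewhere, or serialize `state` ...
policy.set_state(state)      # restores the TF state, then epsilon via set_epsilon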
Example #7
    def __init__(self, observation_space, action_space, config):
        config = dict(ray.rllib.a3c.a3c.DEFAULT_CONFIG, **config)
        self.config = config
        self.sess = tf.get_default_session()

        # Setup the policy
        self.observations = tf.placeholder(tf.float32, [None] +
                                           list(observation_space.shape))
        dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])
        self.model = ModelCatalog.get_model(self.observations, logit_dim,
                                            self.config["model"])
        action_dist = dist_class(self.model.outputs)
        self.vf = tf.reshape(
            linear(self.model.last_layer, 1, "value", normc_initializer(1.0)),
            [-1])
        self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          tf.get_variable_scope().name)
        is_training = tf.placeholder_with_default(True, ())

        # Setup the policy loss
        if isinstance(action_space, gym.spaces.Box):
            ac_size = action_space.shape[0]
            actions = tf.placeholder(tf.float32, [None, ac_size], name="ac")
        elif isinstance(action_space, gym.spaces.Discrete):
            actions = tf.placeholder(tf.int64, [None], name="ac")
        else:
            raise UnsupportedSpaceException(
                "Action space {} is not supported for A3C.".format(
                    action_space))
        advantages = tf.placeholder(tf.float32, [None], name="advantages")
        v_target = tf.placeholder(tf.float32, [None], name="v_target")
        self.loss = A3CLoss(action_dist, actions, advantages, v_target,
                            self.vf, self.config["vf_loss_coeff"],
                            self.config["entropy_coeff"])

        # Initialize TFPolicyGraph
        loss_in = [
            ("obs", self.observations),
            ("actions", actions),
            ("advantages", advantages),
            ("value_targets", v_target),
        ]
        for i, ph in enumerate(self.model.state_in):
            loss_in.append(("state_in_{}".format(i), ph))
        self.state_in = self.model.state_in
        self.state_out = self.model.state_out
        TFPolicyGraph.__init__(self,
                               observation_space,
                               action_space,
                               self.sess,
                               obs_input=self.observations,
                               action_sampler=action_dist.sample(),
                               loss=self.loss.total_loss,
                               loss_inputs=loss_in,
                               is_training=is_training,
                               state_inputs=self.state_in,
                               state_outputs=self.state_out,
                               seq_lens=self.model.seq_lens,
                               max_seq_len=self.config["model"]["max_seq_len"])

        if self.config.get("summarize"):
            bs = tf.to_float(tf.shape(self.observations)[0])
            tf.summary.scalar("model/policy_graph", self.loss.pi_loss / bs)
            tf.summary.scalar("model/value_loss", self.loss.vf_loss / bs)
            tf.summary.scalar("model/entropy", self.loss.entropy / bs)
            tf.summary.scalar("model/grad_gnorm", tf.global_norm(self._grads))
            tf.summary.scalar("model/var_gnorm", tf.global_norm(self.var_list))
            self.summary_op = tf.summary.merge_all()

        self.sess.run(tf.global_variables_initializer())
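A3CLoss above receives the action distribution, sampled actions, advantages, value targets, the value output, and the two coefficients. The conventional way those ingredients combine into a total loss is sketched below for a discrete policy; this is the textbook A3C objective, not necessarily A3CLoss's exact arithmetic.

import tensorflow as tf

logits = tf.placeholder(tf.float32, [None, 4], name="logits")
actions = tf.placeholder(tf.int64, [None], name="ac")
advantages = tf.placeholder(tf.float32, [None], name="advantages")
v_target = tf.placeholder(tf.float32, [None], name="v_target")
vf = tf.placeholder(tf.float32, [None], name="vf")
vf_loss_coeff, entropy_coeff = 0.5, 0.01

logp = -tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=actions, logits=logits)
probs = tf.nn.softmax(logits)
entropy = -tf.reduce_sum(probs * tf.log(probs + 1e-8), axis=1)

pi_loss = -tf.reduce_sum(logp * advantages)               # policy gradient term
vf_loss = 0.5 * tf.reduce_sum(tf.square(vf - v_target))   # value regression term
total_loss = (pi_loss + vf_loss_coeff * vf_loss
              - entropy_coeff * tf.reduce_sum(entropy))   # entropy bonus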
Example #8
    def __init__(self, observation_space, action_space, registry, config):
        if not isinstance(action_space, Discrete):
            raise UnsupportedSpaceException(
                "Action space {} is not supported for DQN.".format(
                    action_space))

        self.config = config
        self.cur_epsilon = 1.0
        num_actions = action_space.n

        # Action inputs
        self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
        self.eps = tf.placeholder(tf.float32, (), name="eps")
        self.cur_observations = tf.placeholder(
            tf.float32, shape=(None,) + observation_space.shape)

        # Action Q network
        with tf.variable_scope(Q_SCOPE) as scope:
            q_values = _build_q_network(
                registry, self.cur_observations, num_actions, config)
            self.q_func_vars = _scope_vars(scope.name)

        # Action outputs
        self.output_actions = _build_action_network(
            q_values,
            self.cur_observations,
            num_actions,
            self.stochastic,
            self.eps)

        # Replay inputs
        self.obs_t = tf.placeholder(
            tf.float32, shape=(None,) + observation_space.shape)
        self.act_t = tf.placeholder(tf.int32, [None], name="action")
        self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
        self.obs_tp1 = tf.placeholder(
            tf.float32, shape=(None,) + observation_space.shape)
        self.done_mask = tf.placeholder(tf.float32, [None], name="done")
        self.importance_weights = tf.placeholder(
            tf.float32, [None], name="weight")

        # q network evaluation
        with tf.variable_scope(Q_SCOPE, reuse=True):
            q_t = _build_q_network(
                registry, self.obs_t, num_actions, config)

        # target q network evaluation
        with tf.variable_scope(Q_TARGET_SCOPE) as scope:
            q_tp1 = _build_q_network(
                registry, self.obs_tp1, num_actions, config)
            self.target_q_func_vars = _scope_vars(scope.name)

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(
            q_t * tf.one_hot(self.act_t, num_actions), 1)

        # compute estimate of best possible value starting from state at t + 1
        if config["double_q"]:
            with tf.variable_scope(Q_SCOPE, reuse=True):
                q_tp1_using_online_net = _build_q_network(
                    registry, self.obs_tp1, num_actions, config)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(
                    q_tp1_best_using_online_net, num_actions), 1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - self.done_mask) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = (
            self.rew_t +
            config["gamma"] ** config["n_step"] * q_tp1_best_masked)

        # compute the error (potentially clipped)
        self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        self.loss = tf.reduce_mean(
            self.importance_weights * _huber_loss(self.td_error))

        # update_target_fn will be called periodically to copy Q network to
        # target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(self.q_func_vars, key=lambda v: v.name),
                sorted(self.target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        self.update_target_expr = tf.group(*update_target_expr)

        # initialize TFPolicyGraph
        self.sess = tf.get_default_session()
        self.loss_inputs = [
            ("obs", self.obs_t),
            ("actions", self.act_t),
            ("rewards", self.rew_t),
            ("new_obs", self.obs_tp1),
            ("dones", self.done_mask),
            ("weights", self.importance_weights),
        ]
        self.is_training = tf.placeholder_with_default(True, ())
        TFPolicyGraph.__init__(
            self, self.sess, obs_input=self.cur_observations,
            action_sampler=self.output_actions, loss=self.loss,
            loss_inputs=self.loss_inputs, is_training=self.is_training)
        self.sess.run(tf.global_variables_initializer())
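For intuition, the target computation above can be mirrored in NumPy: select Q(s,a) for the taken actions with a one-hot mask, zero the bootstrap term on terminal transitions, and form the n-step Bellman target. The values below are made up; this is a sketch, not RLlib code.

import numpy as np

gamma, n_step = 0.99, 1
q_t = np.array([[1.0, 2.0], [0.5, 0.1]])      # online Q(s_t, .) for a batch of 2
act_t = np.array([1, 0])
q_tp1 = np.array([[1.5, 0.3], [0.2, 0.4]])    # target-net Q(s_{t+1}, .)
rew_t = np.array([1.0, 0.0])
done = np.array([0.0, 1.0])

q_t_selected = np.sum(q_t * np.eye(2)[act_t], axis=1)     # [2.0, 0.5]
q_tp1_best_masked = (1.0 - done) * q_tp1.max(axis=1)      # [1.5, 0.0]
target = rew_t + gamma ** n_step * q_tp1_best_masked      # [2.485, 0.0]
td_error = q_t_selected - target                          # [-0.485, 0.5]
print(td_error)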
Example #9
    def __init__(self, observation_space, action_space, config):
        config = dict(ray.rllib.dqn.dqn.DEFAULT_CONFIG, **config)
        if not isinstance(action_space, Discrete):
            raise UnsupportedSpaceException(
                "Action space {} is not supported for DQN.".format(
                    action_space))

        self.config = config
        self.cur_epsilon = 1.0
        num_actions = action_space.n

        def _build_q_network(obs):
            return QNetwork(ModelCatalog.get_model(obs, 1, config["model"]),
                            num_actions, config["dueling"],
                            config["hiddens"]).value

        # Action inputs
        self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
        self.eps = tf.placeholder(tf.float32, (), name="eps")
        self.cur_observations = tf.placeholder(tf.float32,
                                               shape=(None, ) +
                                               observation_space.shape)

        # Action Q network
        with tf.variable_scope(Q_SCOPE) as scope:
            q_values = _build_q_network(self.cur_observations)
            self.q_func_vars = _scope_vars(scope.name)

        # Action outputs
        self.output_actions = QValuePolicy(q_values, self.cur_observations,
                                           num_actions, self.stochastic,
                                           self.eps).action

        # Replay inputs
        self.obs_t = tf.placeholder(tf.float32,
                                    shape=(None, ) + observation_space.shape)
        self.act_t = tf.placeholder(tf.int32, [None], name="action")
        self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
        self.obs_tp1 = tf.placeholder(tf.float32,
                                      shape=(None, ) + observation_space.shape)
        self.done_mask = tf.placeholder(tf.float32, [None], name="done")
        self.importance_weights = tf.placeholder(tf.float32, [None],
                                                 name="weight")

        # q network evaluation
        with tf.variable_scope(Q_SCOPE, reuse=True):
            q_t = _build_q_network(self.obs_t)

        # target q network evaluation
        with tf.variable_scope(Q_TARGET_SCOPE) as scope:
            q_tp1 = _build_q_network(self.obs_tp1)
            self.target_q_func_vars = _scope_vars(scope.name)

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(self.act_t, num_actions),
                                     1)

        # compute estimate of best possible value starting from state at t + 1
        if config["double_q"]:
            with tf.variable_scope(Q_SCOPE, reuse=True):
                q_tp1_using_online_net = _build_q_network(self.obs_tp1)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
                1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)

        self.loss = QLoss(q_t_selected, q_tp1_best, self.importance_weights,
                          self.rew_t, self.done_mask, config["gamma"],
                          config["n_step"])

        # update_target_fn will be called periodically to copy Q network to
        # target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(self.q_func_vars, key=lambda v: v.name),
                sorted(self.target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        self.update_target_expr = tf.group(*update_target_expr)

        # initialize TFPolicyGraph
        self.sess = tf.get_default_session()
        self.loss_inputs = [
            ("obs", self.obs_t),
            ("actions", self.act_t),
            ("rewards", self.rew_t),
            ("new_obs", self.obs_tp1),
            ("dones", self.done_mask),
            ("weights", self.importance_weights),
        ]
        self.is_training = tf.placeholder_with_default(True, ())
        TFPolicyGraph.__init__(self,
                               observation_space,
                               action_space,
                               self.sess,
                               obs_input=self.cur_observations,
                               action_sampler=self.output_actions,
                               loss=self.loss.loss,
                               loss_inputs=self.loss_inputs,
                               is_training=self.is_training)
        self.sess.run(tf.global_variables_initializer())
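The double_q branch picks the greedy action with the online network but scores it with the target network, which damps the overestimation of a plain max over the target network. A NumPy sketch of the difference (values are made up):

import numpy as np

q_tp1_online = np.array([[1.0, 3.0], [2.0, 0.5]])   # online net at s_{t+1}
q_tp1_target = np.array([[0.8, 1.1], [1.9, 2.5]])   # target net at s_{t+1}

best_a = q_tp1_online.argmax(axis=1)                 # [1, 0]
double_q_best = q_tp1_target[np.arange(2), best_a]   # [1.1, 1.9]
vanilla_best = q_tp1_target.max(axis=1)              # [1.1, 2.5]
print(double_q_best, vanilla_best)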
Example #10
    def __init__(self, observation_space, action_space, config):
        config = dict(ray.rllib.ddpg.ddpg.DEFAULT_CONFIG, **config)
        if not isinstance(action_space, Box):
            raise UnsupportedSpaceException(
                "Action space {} is not supported for DDPG.".format(
                    action_space))

        self.config = config
        self.cur_epsilon = 1.0
        dim_actions = action_space.shape[0]
        low_action = action_space.low
        high_action = action_space.high
        self.actor_optimizer = tf.train.AdamOptimizer(
            learning_rate=config["actor_lr"])
        self.critic_optimizer = tf.train.AdamOptimizer(
            learning_rate=config["critic_lr"])

        def _build_q_network(obs, actions):
            return QNetwork(
                ModelCatalog.get_model(obs, 1, config["model"]),
                actions,
                config["critic_hiddens"]).value

        def _build_p_network(obs):
            return PNetwork(
                ModelCatalog.get_model(obs, 1, config["model"]),
                dim_actions,
                config["actor_hiddens"]).action_scores

        def _build_action_network(p_values, stochastic, eps):
            return ActionNetwork(
                p_values,
                low_action,
                high_action,
                stochastic,
                eps,
                config["exploration_theta"],
                config["exploration_sigma"]).actions

        # Action inputs
        self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
        self.eps = tf.placeholder(tf.float32, (), name="eps")
        self.cur_observations = tf.placeholder(
            tf.float32, shape=(None, ) + observation_space.shape)

        # Actor: P (policy) network
        with tf.variable_scope(P_SCOPE) as scope:
            p_values = _build_p_network(self.cur_observations)
            self.p_func_vars = _scope_vars(scope.name)

        # Action outputs
        with tf.variable_scope(A_SCOPE):
            self.output_actions = _build_action_network(
                p_values, self.stochastic, self.eps)

        with tf.variable_scope(A_SCOPE, reuse=True):
            exploration_sample = tf.get_variable(name="ornstein_uhlenbeck")
            self.reset_noise_op = tf.assign(exploration_sample,
                                            dim_actions * [.0])

        # Replay inputs
        self.obs_t = tf.placeholder(
            tf.float32,
            shape=(None, ) + observation_space.shape,
            name="observation")
        self.act_t = tf.placeholder(
            tf.float32, shape=(None, ) + action_space.shape, name="action")
        self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
        self.obs_tp1 = tf.placeholder(
            tf.float32, shape=(None, ) + observation_space.shape)
        self.done_mask = tf.placeholder(tf.float32, [None], name="done")
        self.importance_weights = tf.placeholder(
            tf.float32, [None], name="weight")

        # p network evaluation
        with tf.variable_scope(P_SCOPE, reuse=True) as scope:
            self.p_t = _build_p_network(self.obs_t)

        # target p network evaluation
        with tf.variable_scope(P_TARGET_SCOPE) as scope:
            p_tp1 = _build_p_network(self.obs_tp1)
            target_p_func_vars = _scope_vars(scope.name)

        # Action outputs
        with tf.variable_scope(A_SCOPE, reuse=True):
            deterministic_flag = tf.constant(value=False, dtype=tf.bool)
            zero_eps = tf.constant(value=.0, dtype=tf.float32)
            output_actions = _build_action_network(
                self.p_t, deterministic_flag, zero_eps)

            output_actions_estimated = _build_action_network(
                p_tp1, deterministic_flag, zero_eps)

        # q network evaluation
        with tf.variable_scope(Q_SCOPE) as scope:
            q_t = _build_q_network(self.obs_t, self.act_t)
            self.q_func_vars = _scope_vars(scope.name)
        with tf.variable_scope(Q_SCOPE, reuse=True):
            q_tp0 = _build_q_network(self.obs_t, output_actions)

        # target q network evaluation
        with tf.variable_scope(Q_TARGET_SCOPE) as scope:
            q_tp1 = _build_q_network(self.obs_tp1, output_actions_estimated)
            target_q_func_vars = _scope_vars(scope.name)

        self.loss = ActorCriticLoss(
            q_t, q_tp1, q_tp0, self.importance_weights, self.rew_t,
            self.done_mask, config["gamma"], config["n_step"],
            config["use_huber"], config["huber_threshold"])

        if config["l2_reg"] is not None:
            for var in self.p_func_vars:
                if "bias" not in var.name:
                    self.loss.actor_loss += (
                        config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))
            for var in self.q_func_vars:
                if "bias" not in var.name:
                    self.loss.critic_loss += (
                        config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))

        # update_target_fn will be called periodically to copy Q network to
        # target Q network
        self.tau_value = config.get("tau")
        self.tau = tf.placeholder(tf.float32, (), name="tau")
        update_target_expr = []
        for var, var_target in zip(
                sorted(self.q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(
                var_target.assign(self.tau * var +
                                  (1.0 - self.tau) * var_target))
        for var, var_target in zip(
                sorted(self.p_func_vars, key=lambda v: v.name),
                sorted(target_p_func_vars, key=lambda v: v.name)):
            update_target_expr.append(
                var_target.assign(self.tau * var +
                                  (1.0 - self.tau) * var_target))
        self.update_target_expr = tf.group(*update_target_expr)

        self.sess = tf.get_default_session()
        self.loss_inputs = [
            ("obs", self.obs_t),
            ("actions", self.act_t),
            ("rewards", self.rew_t),
            ("new_obs", self.obs_tp1),
            ("dones", self.done_mask),
            ("weights", self.importance_weights),
        ]
        self.is_training = tf.placeholder_with_default(True, ())
        TFPolicyGraph.__init__(
            self, observation_space, action_space, self.sess,
            obs_input=self.cur_observations,
            action_sampler=self.output_actions, loss=self.loss.total_loss,
            loss_inputs=self.loss_inputs, is_training=self.is_training)
        self.sess.run(tf.global_variables_initializer())

        # Note that this encompasses both the policy and Q-value networks and
        # their corresponding target networks
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(q_tp0, q_tp1), self.sess)

        # Hard initial update
        self.update_target(tau=1.0)
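Inside ActorCriticLoss, the actor term is the deterministic-policy-gradient objective already visible in Example #4: minimize -mean(Q(s, pi(s))), applying the gradient only to the actor's variables. A toy standalone sketch of that split; layer sizes and scope names are illustrative.

import numpy as np
import tensorflow as tf

obs = tf.placeholder(tf.float32, [None, 3], name="obs")
actions = tf.layers.dense(obs, 1, activation=tf.nn.tanh, name="actor")
q_tp0 = tf.layers.dense(tf.concat([obs, actions], axis=1), 1, name="critic")
actor_loss = -1.0 * tf.reduce_mean(q_tp0)

# Only the actor's variables receive this gradient; the critic is trained
# separately on its TD loss.
actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "actor")
train_actor = tf.train.AdamOptimizer(1e-3).minimize(actor_loss,
                                                    var_list=actor_vars)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train_actor, feed_dict={obs: np.random.randn(8, 3)})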
Example #11
    def __init__(self,
                 observation_space,
                 action_space,
                 config,
                 existing_inputs=None):
        """
        Arguments:
            observation_space: Environment observation space specification.
            action_space: Environment action space specification.
            config (dict): Configuration values for PPO graph.
            existing_inputs (list): Optional list of tuples specifying the
                placeholders on which the graph should be built.
        """
        self.sess = tf.get_default_session()
        self.action_space = action_space
        self.config = config
        self.kl_coeff_val = self.config["kl_coeff"]
        self.kl_target = self.config["kl_target"]
        dist_cls, logit_dim = ModelCatalog.get_action_dist(action_space)

        if existing_inputs:
            self.loss_in = existing_inputs
            obs_ph, value_targets_ph, adv_ph, act_ph, \
                logprobs_ph, vf_preds_ph = [ph for _, ph in existing_inputs]
        else:
            obs_ph = tf.placeholder(tf.float32,
                                    name="obs",
                                    shape=(None, ) + observation_space.shape)
            # Targets of the value function.
            value_targets_ph = tf.placeholder(tf.float32,
                                              name="value_targets",
                                              shape=(None, ))
            # Advantage values in the policy gradient estimator.
            adv_ph = tf.placeholder(tf.float32,
                                    name="advantages",
                                    shape=(None, ))
            act_ph = ModelCatalog.get_action_placeholder(action_space)
            # Log probabilities from the policy before the policy update.
            logprobs_ph = tf.placeholder(tf.float32,
                                         name="logprobs",
                                         shape=(None, logit_dim))
            # Value function predictions before the policy update.
            vf_preds_ph = tf.placeholder(tf.float32,
                                         name="vf_preds",
                                         shape=(None, ))
            self.loss_in = [("obs", obs_ph),
                            ("value_targets", value_targets_ph),
                            ("advantages", adv_ph), ("actions", act_ph),
                            ("logprobs", logprobs_ph),
                            ("vf_preds", vf_preds_ph)]

        # KL Coefficient
        self.kl_coeff = tf.get_variable(initializer=tf.constant_initializer(
            self.kl_coeff_val),
                                        name="kl_coeff",
                                        shape=(),
                                        trainable=False,
                                        dtype=tf.float32)

        self.logits = ModelCatalog.get_model(obs_ph, logit_dim,
                                             self.config["model"]).outputs
        curr_action_dist = dist_cls(self.logits)
        self.sampler = curr_action_dist.sample()
        if self.config["use_gae"]:
            vf_config = self.config["model"].copy()
            # Do not split the last layer of the value function into
            # mean parameters and standard deviation parameters and
            # do not make the standard deviations free variables.
            vf_config["free_log_std"] = False
            with tf.variable_scope("value_function"):
                self.value_function = ModelCatalog.get_model(
                    obs_ph, 1, vf_config).outputs
            self.value_function = tf.reshape(self.value_function, [-1])
        else:
            self.value_function = tf.constant("NA")

        self.loss_obj = PPOLoss(action_space,
                                value_targets_ph,
                                adv_ph,
                                act_ph,
                                logprobs_ph,
                                vf_preds_ph,
                                curr_action_dist,
                                self.value_function,
                                self.kl_coeff,
                                entropy_coeff=self.config["entropy_coeff"],
                                clip_param=self.config["clip_param"],
                                vf_loss_coeff=self.config["kl_target"],
                                use_gae=self.config["use_gae"])
        self.is_training = tf.placeholder_with_default(True, ())

        TFPolicyGraph.__init__(self,
                               observation_space,
                               action_space,
                               self.sess,
                               obs_input=obs_ph,
                               action_sampler=self.sampler,
                               loss=self.loss_obj.loss,
                               loss_inputs=self.loss_in,
                               is_training=self.is_training)
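PPOLoss above is driven by the current action distribution, the pre-update log-probabilities (logprobs_ph), advantages, value targets, and the KL coefficient variable. The clipped surrogate term that a loss of this shape conventionally contains is sketched below; this is the textbook form, not necessarily PPOLoss's exact code.

import numpy as np
import tensorflow as tf

logp = tf.placeholder(tf.float32, [None], name="logp")          # new policy log p(a|s)
old_logp = tf.placeholder(tf.float32, [None], name="old_logp")  # pre-update log p(a|s)
adv = tf.placeholder(tf.float32, [None], name="advantages")
clip_param = 0.3

ratio = tf.exp(logp - old_logp)
surrogate = tf.minimum(
    ratio * adv,
    tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv)
policy_loss = -tf.reduce_mean(surrogate)

with tf.Session() as sess:
    print(sess.run(policy_loss, feed_dict={
        logp: np.log([0.6, 0.2]),
        old_logp: np.log([0.5, 0.4]),
        adv: np.array([1.0, -1.0], dtype=np.float32),
    }))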