Example #1
def q_train(make_obs_ph_n,
            act_space_n,
            q_index,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            num_units=64):
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
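        # With local_q_func the critic sees only this agent's own observation and
        # action (plain DDPG); otherwise q_input is the centralized MADDPG critic
        # input built from every agent's observation and action.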
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # viscosity solution to Bellman differential equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss  #+ 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph],
                           outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # target network
        target_q = q_func(q_input,
                          1,
                          scope="target_q_func",
                          num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        return train, update_target_q, {
            'q_values': q_values,
            'target_q_values': target_q_values
        }
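The callables returned above slot into a MADDPG-style critic update. A hedged sketch of one such step follows; `agents`, `q_debug`, `obs_n`, `act_n`, `obs_next_n`, `rew`, `done`, and `gamma` are illustrative names for a replay-buffer sample and the per-agent trainers, not part of the listing.

# Hedged sketch of one critic update using the callables from q_train above.
# `agents`, `q_debug`, and the batch arrays are assumed names, not from the listing.
target_act_next_n = [agents[i].p_debug['target_act'](obs_next_n[i])
                     for i in range(len(obs_next_n))]
q_next = q_debug['target_q_values'](*(obs_next_n + target_act_next_n))
target = rew + gamma * (1.0 - done) * q_next   # 1-step TD target
q_loss = train(*(obs_n + act_n + [target]))    # one gradient step on the critic
update_target_q()                              # Polyak update of target_q_func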
Example #2
def p_train(make_obs_ph_n,
            act_space_n,
            agent_idx,
            p_func,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None):
    """

    :param make_obs_ph_n:
    :param act_space_n:
    :param agent_idx:
    :param p_func: in base maddpg code = mlp_model
    :param q_func: in base maddpg code = mlp_model
    :param optimizer:
    :param grad_norm_clipping:
    :param local_q_func:
    :param num_units:
    :param scope:
    :param reuse:
    :return:
    """
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = [tf.layers.flatten(obs_ph) for obs_ph in make_obs_ph_n]
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]

        p_input = obs_ph_n[agent_idx]

        p = p_func(p_input,
                   int(act_pdtype_n[agent_idx].param_shape()[0]),
                   scope="p_func",
                   num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[agent_idx].pdfromflat(p)

        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        act_input_n = act_ph_n + []
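        # Replace this agent's entry with a differentiable sample from its current
        # policy so the critic's gradient can flow back into p_func_vars.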
        act_input_n[agent_idx] = act_pd.sample()  #act_pd.mode() #
        q_input = tf.concat(obs_ph_n + act_input_n, 1)

        q = q_func(q_input,
                   1,
                   scope="q_func" + str(1),
                   reuse=True,
                   num_units=num_units)[:, 0]

        loss = -tf.reduce_mean(q) + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=make_obs_ph_n + act_ph_n,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[make_obs_ph_n[agent_idx]], outputs=act_sample)
        p_values = U.function([make_obs_ph_n[agent_idx]], p)

        # target network
        target_p = p_func(p_input,
                          int(act_pdtype_n[agent_idx].param_shape()[0]),
                          scope="target_p_func",
                          num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[agent_idx].pdfromflat(
            target_p).sample()
        target_act = U.function(inputs=[make_obs_ph_n[agent_idx]],
                                outputs=target_act_sample)

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act
        }
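Both examples call `make_update_exp` for the soft target-network updates, but it is not shown in the listing. A minimal sketch consistent with how it is used here, assuming the Polyak coefficient of the reference MADDPG implementation:

def make_update_exp(vals, target_vals, polyak=1.0 - 1e-2):
    # Soft (Polyak-averaged) copy of the online variables into the target
    # variables, matched by sorted variable name.
    expression = []
    for var, var_target in zip(sorted(vals, key=lambda v: v.name),
                               sorted(target_vals, key=lambda v: v.name)):
        expression.append(
            var_target.assign(polyak * var_target + (1.0 - polyak) * var))
    expression = tf.group(*expression)
    return U.function([], [], updates=[expression])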
Example #3
    def __init__(self, input_space, act_space, scope, args):
        self.input_shape = input_space
        self.act_space = act_space
        self.scope = scope
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
        self.optimizer = tf.train.AdamOptimizer(learning_rate=args.lr)
        self.grad_norm_clipping = 0.5
        with tf.variable_scope(self.scope):
            act_pdtype = make_pdtype(act_space)

            # act_ph = act_pdtype.sample_placeholder([None], name= "action")
            act_ph = tf.placeholder(tf.float32, shape=(None, 1))
            if args.game == "RoboschoolPong-v1":
                obs_ph = tf.placeholder(tf.float32,
                                        shape=(None, input_space.shape[0]))
            elif args.game == "Pong-2p-v0":
                obs_ph = tf.placeholder(tf.float32,
                                        shape=(None, input_space.shape[0],
                                               input_space.shape[1],
                                               input_space.shape[2]))
            q_target = tf.placeholder(tf.float32, shape=(None, ))

            #build the world representation z
            z = conv_model(obs_ph, 20, scope="world_model")
            p_input = z

            p = mlp_model(p_input, 2, scope="p_func")
            p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

            act_pd = act_pdtype.pdfromflat(p)
            act_sample = act_pd.sample()

            p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

            q_input = tf.concat([z, act_sample], -1)
            q = mlp_model(q_input, 1, scope="q_func")[:, 0]
            q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))
            pg_loss = -tf.reduce_mean(q)

            q_loss = tf.reduce_mean(tf.square(q - q_target))
            # q_reg = tf.reduce_mean(tf.square(q))
            q_optimize_expr = U.minimize_and_clip(self.optimizer, q_loss,
                                                  q_func_vars,
                                                  self.grad_norm_clipping)

            p_loss = pg_loss + p_reg * 1e-3

            p_optimize_expr = U.minimize_and_clip(self.optimizer, p_loss,
                                                  p_func_vars,
                                                  self.grad_norm_clipping)

            p_values = U.function([obs_ph], p)

            target_p = mlp_model(z, 2, scope="target_p_func")
            target_p_func_vars = U.scope_vars(
                U.absolute_scope_name("target_p_func"))

            target_q = mlp_model(q_input, 1, scope="target_q_func")[:, 0]
            target_q_func_vars = U.scope_vars(
                U.absolute_scope_name("target_q_func"))
            target_act_sample = act_pdtype.pdfromflat(target_p).sample()

            self.update_target_p = make_update_exp(p_func_vars,
                                                   target_p_func_vars)
            self.update_target_q = make_update_exp(q_func_vars,
                                                   target_q_func_vars)

            self.act = U.function(inputs=[obs_ph], outputs=act_sample)
            self.target_act = U.function(inputs=[obs_ph],
                                         outputs=target_act_sample)
            self.p_train = U.function(inputs=[obs_ph] + [act_ph],
                                      outputs=p_loss,
                                      updates=[p_optimize_expr])
            self.q_train = U.function(inputs=[obs_ph] + [act_ph] + [q_target],
                                      outputs=q_loss,
                                      updates=[q_optimize_expr])
            self.q_values = U.function([obs_ph] + [act_ph], q)
            self.target_q_values = U.function([obs_ph] + [act_ph], target_q)
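A hedged sketch of how an agent built this way might be driven; `env`, `gamma`, `batch_size`, and the replay-buffer API are assumptions for illustration, not part of the listing:

# Hypothetical interaction/update loop for the agent above.
obs = env.reset()
action = agent.act(obs[None])[0]                      # sample an action from p_func
next_obs, rew, done, _ = env.step(action)
agent.replay_buffer.add(obs, action, rew, next_obs, float(done))

if len(agent.replay_buffer) > agent.max_replay_buffer_len:
    obs_b, act_b, rew_b, next_obs_b, done_b = agent.replay_buffer.sample(batch_size)
    next_act_b = agent.target_act(next_obs_b)
    q_next = agent.target_q_values(next_obs_b, next_act_b)
    target = rew_b + gamma * (1.0 - done_b) * q_next  # 1-step TD target
    q_loss = agent.q_train(obs_b, act_b, target)      # critic step
    p_loss = agent.p_train(obs_b, act_b)              # actor step
    agent.update_target_q()
    agent.update_target_p()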
Example #4
def build_train(make_obs_ph,
                q_func,
                num_actions,
                num_action_streams,
                batch_size,
                optimizer_name,
                learning_rate,
                grad_norm_clipping=None,
                gamma=0.99,
                double_q=True,
                scope="deepq",
                reuse=None,
                loss_type="L2"):
    """Creates the act function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        total number of sub-actions to be represented at the output  
    num_action_streams: int
        specifies the number of action branches in action value (or advantage) function representation
    batch_size: int
        size of the sampled mini-batch from the replay buffer 
    optimizer_name: str
        name of the optimizer to use for deep Q-learning (currently only "Adam" is supported)
    learning_rate: float
        learning rate for the optimizer
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q-Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled. BDQ uses it. 
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse, the scope must be given.
    loss_type: str
        per-stream loss applied to the TD error; either "L2" or "Huber".

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """

    act_f, q_f = build_act(make_obs_ph,
                           q_func,
                           num_actions,
                           num_action_streams,
                           scope=scope,
                           reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # Set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None, num_action_streams],
                                  name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")

        # Q-network evaluation
        q_t = q_func(obs_t_input.get(),
                     num_actions,
                     scope="q_func",
                     reuse=True)  # reuse parameters from act
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # Target Q-network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))

        if double_q:
            selection_q_tp1 = q_func(obs_tp1_input.get(),
                                     num_actions,
                                     scope="q_func",
                                     reuse=True)
        else:
            selection_q_tp1 = q_tp1

        num_actions_pad = num_actions // num_action_streams

        q_values = []
        for dim in range(num_action_streams):
            selected_a = tf.squeeze(
                tf.slice(act_t_ph, [0, dim], [batch_size, 1]))  # TODO better?
            q_values.append(
                tf.reduce_sum(tf.one_hot(selected_a, num_actions_pad) *
                              q_t[dim],
                              axis=1))

        target_q_values = []
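        # For each branch, the greedy next sub-action is chosen from selection_q_tp1
        # (the online network when double_q=True) and evaluated with the target
        # network q_tp1, giving one TD target per action branch.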
        for dim in range(num_action_streams):
            selected_a = tf.argmax(selection_q_tp1[dim], axis=1)
            selected_q = tf.reduce_sum(
                tf.one_hot(selected_a, num_actions_pad) * q_tp1[dim], axis=1)
            masked_selected_q = (1.0 - done_mask_ph) * selected_q
            target_q = rew_t_ph + gamma * masked_selected_q
            target_q_values.append(target_q)

        if optimizer_name == "Adam":
            optimizer = tf.train.AdamOptimizer(learning_rate)
        else:
            assert False, 'unsupported optimizer ' + str(optimizer_name)

        if loss_type == "L2":
            loss_function = tf.square
        elif loss_type == "Huber":
            loss_function = U.huber_loss
        else:
            assert False, 'unsupported loss type ' + str(loss_type)

        stream_losses = []
        for dim in range(num_action_streams):
            dim_td_error = q_values[dim] - tf.stop_gradient(
                target_q_values[dim])
            dim_loss = loss_function(dim_td_error)
            # Scaling of learning based on importance sampling weights is optional, either way works
            stream_losses.append(
                tf.reduce_mean(dim_loss *
                               importance_weights_ph))  # with scaling
            if dim == 0:
                td_error = tf.abs(dim_td_error)
            else:
                td_error += tf.abs(dim_td_error)

        mean_loss = sum(stream_losses) / num_action_streams
        optimize_expr = U.minimize_and_clip(
            optimizer,
            mean_loss,
            var_list=q_func_vars,
            total_n_streams=num_action_streams,
            clip_val=grad_norm_clipping)
        optimize_expr = [optimize_expr]

        # Target Q-network parameters are periodically updated with the Q-network's
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        train = U.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
            importance_weights_ph
        ],
                           outputs=td_error,
                           updates=optimize_expr)
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, q_f, train, update_target, {'q_values': q_values}
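A hedged sketch of wiring build_train for a branching agent; `conv_mlp_model`, the observation shape, and the hyperparameters are assumptions for illustration, and `U.BatchInput` is assumed to be the placeholder helper from the same tf_util module that provides `ensure_tf_input`:

# Hypothetical setup; all names and numbers below are illustrative.
act, q_f, train, update_target, debug = build_train(
    make_obs_ph=lambda name: U.BatchInput((84, 84, 4), name=name),
    q_func=conv_mlp_model,   # assumed model builder with an (obs, num_actions, scope, reuse) signature
    num_actions=18,          # 3 branches x 6 sub-actions each
    num_action_streams=3,
    batch_size=32,           # must equal the sampled batch size (see tf.slice above)
    optimizer_name="Adam",
    learning_rate=1e-4,
    grad_norm_clipping=10,
    gamma=0.99,
    double_q=True,
    loss_type="Huber")

# One gradient step on a sampled batch; `act_t` has shape (batch_size, num_action_streams).
td_err = train(obs_t, act_t, rew_t, obs_tp1, done_mask, weights)
update_target()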