Exemplo n.º 1
0
def build_train(train_dequeue,
                num_training_steps,
                q_func,
                num_actions,
                optimizer,
                grad_norm_clipping=None,
                data_format=None,
                gamma=None,
                multi_step_n=1,
                double_q=True,
                scope="deepq",
                reuse=None,
                replay_buffer=None,
                prioritized_replay_eps=None,
                bellman_h=None,
                bellman_ih=None,
                use_temporal_consistency=True):
    """Build the learner-side training op for (double) Q-learning.

    Parameters
    ----------
    train_dequeue: 8-tuple
        unpacked as (actor_num, obs_t, act_t, rew_t, obs_tp1, done_mask,
        importance_weights, idxs). ``actor_num`` is not used here.
    num_training_steps: tf.Variable
        counter incremented by one each time the returned train op runs.
    q_func: callable
        called as ``q_func(obs, num_actions, scope=..., data_format=...)``
        (plus ``reuse=True`` for repeated evaluations). Its output is treated
        as one value per action along axis 1.
    num_actions: int
        number of actions.
    optimizer: tf.train.Optimizer
        optimizer used to minimize the weighted error.
    grad_norm_clipping: float or None
        if not None, passed as ``clip_val`` to ``U.minimize_and_clip``.
    data_format: object
        forwarded unchanged to ``q_func``.
    gamma: float
        discount rate; must be provided (it is raised to ``multi_step_n``).
    multi_step_n: int
        exponent applied to ``gamma`` in the Bellman target.
    double_q: bool
        if true the next action is chosen by the online network and evaluated
        by the target network; otherwise the target network's maximum is used.
    scope: str or VariableScope
        variable scope everything is built under.
    reuse: bool or None
        forwarded to ``tf.variable_scope``.
    replay_buffer: object or None
        if truthy, ``replay_buffer.assign(idxs, |td_error| + eps)`` is grouped
        with the optimization step.
    prioritized_replay_eps: float
        added to the absolute TD error; required when ``replay_buffer`` is set.
    bellman_h, bellman_ih: callable
        the target is ``bellman_h(rew + gamma**n * bellman_ih(q_tp1))``; both
        must be provided. NOTE(review): presumably a value-rescaling function
        and its inverse -- confirm with the caller.
    use_temporal_consistency: bool
        if true, adds a Huber loss on the difference between the online and
        the target network's masked next-state value estimates.

    Returns
    -------
    train: tf.Tensor
        result of incrementing ``num_training_steps`` after the optimization
        step and the read-copy update have run.
    update_target_expr: tf.Operation
        copies the ``q_func`` variables into the ``target_q_func`` variables.
    """
    with tf.variable_scope(scope, reuse=reuse):
        (_actor_num, obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input,
         done_mask_ph, importance_weights_ph, idxs) = train_dequeue

        # q network evaluation
        q_t = q_func(obs_t_input,
                     num_actions,
                     scope="q_func",
                     data_format=data_format)
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input,
                       num_actions,
                       scope="target_q_func",
                       data_format=data_format)
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions),
                                     1)

        # The online network's view of the next state is needed by double-Q
        # AND by the temporal-consistency term, so build it when either is
        # enabled (previously it only existed under double_q, which raised a
        # NameError for double_q=False with use_temporal_consistency=True).
        if double_q or use_temporal_consistency:
            q_tp1_using_online_net = q_func(obs_tp1_input,
                                            num_actions,
                                            scope="q_func",
                                            reuse=True,
                                            data_format=data_format)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
                1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        # terminal transitions contribute no bootstrap value
        q_tp1_best_masked = tf.stop_gradient((1.0 - done_mask_ph) * q_tp1_best)

        # compute RHS of bellman equation
        q_t_selected_target = bellman_h(rew_t_ph + gamma**multi_step_n *
                                        bellman_ih(q_tp1_best_masked))
        q_t_selected_target = tf.stop_gradient(q_t_selected_target)

        # compute the error (potentially clipped)
        td_error = q_t_selected - q_t_selected_target
        errors = U.huber_loss(td_error)

        # Temporal-consistency component attributed to Pohlen et al. in the
        # original comment (said to allow higher discount factors); the
        # original author noted it seemed to slow down learning.
        if use_temporal_consistency:
            q_tp1_best_using_online_net_masked = (
                1.0 - done_mask_ph) * tf.reduce_max(q_tp1_using_online_net, 1)
            tc_error = q_tp1_best_using_online_net_masked - q_tp1_best_masked
            errors = errors + U.huber_loss(tc_error)

        weighted_error = tf.reduce_mean(importance_weights_ph * errors)
        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error,
                                               var_list=q_func_vars)

        # update_target_expr is meant to be run periodically to copy the
        # Q network into the target Q network; variables are paired by name.
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # To avoid unnecessary copies between gpus we maintain a copy on the
        # actors' GPU that is updated each iteration. The call is made only to
        # bind the "read_q_func" scope; with reuse=True its variables must
        # already exist (presumably created by the actor graph -- verify).
        with tf.device('/gpu:0'):
            q_func(obs_t_input,
                   num_actions,
                   scope="read_q_func",
                   data_format=data_format,
                   reuse=True)
            read_q_func_vars = U.scope_vars(
                U.absolute_scope_name("read_q_func"))
        update_read_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(read_q_func_vars, key=lambda v: v.name)):
            update_read_expr.append(var_target.assign(var))
        update_read_expr = tf.group(*update_read_expr)

        if replay_buffer:
            # refresh the priorities of the sampled transitions together with
            # the optimization step
            new_priorities = tf.abs(td_error) + prioritized_replay_eps
            update_priority = replay_buffer.assign(idxs, new_priorities)
            optimize_expr = tf.group(optimize_expr, update_priority)

        # the step counter only advances once the optimization step and the
        # read-copy update have both run
        with tf.control_dependencies([optimize_expr, update_read_expr]):
            train = tf.assign_add(num_training_steps, 1)

        return train, update_target_expr
Exemplo n.º 2
0
def _gather_action_distribution(dist, actions, batch_size, num_atoms):
    """Select, for every batch row, the num_atoms entries of one action.

    ``dist`` is flattened and indexed as
    ``row * tf.shape(dist)[1] + action * num_atoms + atom``, so each row is
    assumed to store the atoms of action 0, then action 1, and so on.
    Returns a tensor reshaped to (batch_size, num_atoms).
    """
    row_offset = tf.reshape(
        tf.range(batch_size) * tf.shape(dist)[1], [batch_size, 1])
    action_offset = tf.reshape(actions * num_atoms, [batch_size, 1])
    # (batch, 1) + (batch, 1) + (num_atoms,) broadcasts to (batch, num_atoms)
    flat_index = row_offset + action_offset + tf.range(num_atoms)
    selected = tf.gather(tf.reshape(dist, [-1]), tf.reshape(flat_index, [-1]))
    return tf.reshape(selected, [batch_size, num_atoms])


def build_dist_train(make_obs_ph,
                     dist_func,
                     num_actions,
                     num_atoms,
                     V_max,
                     optimizer,
                     grad_norm_clipping=None,
                     gamma=1.0,
                     double_q=False,
                     scope="deepq",
                     reuse=None):
    """Creates the train function for a categorical value distribution.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    dist_func: callable
        the model; called here as
        ``dist_func(observation, num_actions, scope=..., reuse=...)``.
        Its output is indexed as a flat, per-row layout of
        ``num_actions * num_atoms`` probabilities (all atoms of action 0
        first, then action 1, ...).
    num_actions: int
        number of actions
    num_atoms: int
        number of atoms of the value support, spread evenly over
        [-V_max, V_max].
    V_max: float
        largest value of the support; the smallest is ``-V_max``.
    optimizer: tf.train.Optimizer
        optimizer to use for the objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        currently ignored: the next action is always chosen from the target
        network's expected values.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: object
        whatever ``build_dist_act`` returns for the same arguments.
    train: function
        takes (obs_t, action, reward, obs_tp1, done, weight), performs one
        optimization step and returns the scalar importance-weighted loss.
    update_target: () -> ()
        copy the parameters from the optimized network to the target network.
    debug: {str: function}
        ``{'q_dist_values': f}`` where ``f(obs)`` evaluates the online
        network's output.
    """
    act_f = build_dist_act(make_obs_ph,
                           dist_func,
                           num_actions,
                           num_atoms,
                           V_max,
                           scope=scope,
                           reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")

        # value distribution network evaluation: p(x_t, a)
        v_dist_t = dist_func(obs_t_input.get(),
                             num_actions,
                             scope="dist_func",
                             reuse=True)  # reuse parameters from act
        v_dist_func_vars = U.scope_vars(U.absolute_scope_name("dist_func"))

        # target value distribution network evaluation: p(x_(t+1), a)
        v_dist_tp1 = dist_func(obs_tp1_input.get(),
                               num_actions,
                               scope="target_dist_func")
        target_v_dist_func_vars = U.scope_vars(
            U.absolute_scope_name("target_dist_func"))

        batch_size = tf.shape(obs_t_input.get())[0]

        # (0) p(x_t, a_t): distribution of the action that was actually taken
        v_dist_t_selected = _gather_action_distribution(
            v_dist_t, act_t_ph, batch_size, num_atoms)

        # (1) Q(x_(t+1), a) from the target distribution
        v_max = float(V_max)
        v_min = -v_max
        # float() keeps this a true division even for integer V_max
        delta_z = (v_max - v_min) / (num_atoms - 1)
        q_tp1 = q_value(v_dist_tp1, num_atoms, num_actions, V_max, delta_z)

        # (2) a* = argmax_a Q(x_(t+1), a)
        q_tp1_best_act = tf.cast(U.argmax(q_tp1, axis=1), tf.int32)

        # (3) P(x_(t+1), a*)
        v_dist_tp1_selected = _gather_action_distribution(
            v_dist_tp1, q_tp1_best_act, batch_size, num_atoms)

        # (4) T_z, b, l, u in matrix form, all (batch, num_atoms).
        # linspace guarantees exactly num_atoms support points; a float-step
        # tf.range can produce one extra element through rounding.
        z = tf.reshape(tf.linspace(v_min, v_max, num_atoms), [1, num_atoms])
        r = tf.reshape(rew_t_ph, [batch_size, 1])
        not_done = 1.0 - tf.reshape(done_mask_ph, [batch_size, 1])

        T_z = r + z * gamma * not_done
        # Restrict upper and lower value of T_z to V_max and V_min
        T_z = tf.maximum(tf.minimum(T_z, v_max), v_min)
        # fractional atom position; clipped so that rounding error can never
        # push ceil(b) past the last atom
        b = tf.clip_by_value((T_z - v_min) / delta_z, 0.0,
                             float(num_atoms - 1))
        l, u = tf.floor(b), tf.ceil(b)
        l_id = tf.cast(l, tf.int32)
        u_id = tf.cast(u, tf.int32)

        # Split each target atom's mass between its two neighbours. Using
        # 1 - (b - l) for the lower one (instead of u - b) keeps the full mass
        # when b falls exactly on an atom (l == u); u - b and b - l would both
        # be zero there and the mass would be lost.
        upper_weight = b - l
        lower_weight = 1.0 - upper_weight

        flat_p_t = tf.reshape(v_dist_t_selected, [-1])
        row_start = tf.range(batch_size) * num_atoms

        # sum over target atoms of projected mass times log p(x_t, a_t)
        log_likelihood = tf.zeros([batch_size])
        for j in range(num_atoms):
            log_p_tl = tf.log(tf.gather(flat_p_t, l_id[:, j] + row_start))
            log_p_tu = tf.log(tf.gather(flat_p_t, u_id[:, j] + row_start))
            p_tp1 = v_dist_tp1_selected[:, j]
            log_likelihood = log_likelihood + p_tp1 * (
                lower_weight[:, j] * log_p_tl +
                upper_weight[:, j] * log_p_tu)

        # cross-entropy per sample
        err = tf.negative(log_likelihood)

        # apply the importance weights that the train function is fed
        weighted_error = tf.reduce_mean(importance_weights_ph * err)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=v_dist_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error,
                                               var_list=v_dist_func_vars)

        # update_target is meant to be called periodically to copy the online
        # network to the target network; variables are paired by name.
        update_target_expr = []
        for var, var_target in zip(
                sorted(v_dist_func_vars, key=lambda v: v.name),
                sorted(target_v_dist_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
            importance_weights_ph
        ],
                           outputs=weighted_error,
                           updates=[optimize_expr])
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], v_dist_t)

        return act_f, train, update_target, {'q_dist_values': q_values}
Exemplo n.º 3
0
def build_train(make_obs_ph,
                q_func,
                num_actions,
                optimizer,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=True,
                scope="deepq",
                reuse=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation (built by
        ``build_act`` with the same arguments).
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        performs one optimization step on the error in Bellman's equation
        and returns the TD error of every sample.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
    debug: {str: function}
        ``{'q_values': f}`` where ``f(obs)`` evaluates the online Q network.
    """
    act_f = build_act(make_obs_ph,
                      q_func,
                      num_actions,
                      scope=scope,
                      reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")

        # q network evaluation
        q_t = q_func(obs_t_input.get(),
                     num_actions,
                     scope="q_func",
                     reuse=True)  # reuse parameters from act
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions),
                                     1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            # the online network picks the action, the target network
            # evaluates it
            q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                            num_actions,
                                            scope="q_func",
                                            reuse=True)
            # tf.argmax replaces the deprecated tf.arg_max alias
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
                1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        # terminal transitions contribute no bootstrap value
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped); the target is held
        # constant for the gradient
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)
        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error,
                                               var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to
        # target Q network; variables are paired by sorted name.
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
            importance_weights_ph
        ],
                           outputs=td_error,
                           updates=[optimize_expr])
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, {'q_values': q_values}
Exemplo n.º 4
0
def build_train(make_obs_ph,
                q_func,
                n_actions,
                optimizer,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=True,
                scope="deepq",
                reuse=None,
                param_noise=False,
                param_noise_filter_func=None):
    """
    Build the acting, training and target-update functions for double DQN.

    :param make_obs_ph: str -> tf.placeholder
    a function that creates a placeholder given that name
    :param q_func: input, n_actions, scope, reuse -> tf.Tensor
    the model that takes the following parameters:
        input: tf.placeholder
        n_actions: int, number of actions
        scope: str
        reuse: bool, whether to reuse the variables from the scope
    :param n_actions: number of actions
    :param optimizer: optimizer used to minimize the weighted Huber loss
    :param grad_norm_clipping: if not None, passed as clip_val to
        U.minimize_and_clip; otherwise optimizer.minimize is used directly
    :param gamma: discount rate
    :param double_q: bool, whether to use double q value or not; only True
        is implemented
    :param scope: variable scope everything is built under
    :param reuse: forwarded to tf.variable_scope
    :param param_noise: must be falsy; parameter noise is not implemented
    :param param_noise_filter_func: currently unused
    :return: a bunch of functions
        act_f: function to generate actions
        train_f: function to update the main network; returns the TD errors
        update_target_f: function used to update the target network
        {'q_values': f}: f(obs) evaluates the online Q network
    :raises NotImplementedError: if param_noise is truthy or double_q is falsy
    """
    # Reject unsupported options before any graph ops are created.
    # (NotImplemented is a comparison sentinel, not an exception; calling it
    # produced an unrelated TypeError.)
    if param_noise:
        raise NotImplementedError(
            "parameter-noise exploration is not implemented")
    if not double_q:
        raise NotImplementedError(
            "only double Q-learning is implemented; pass double_q=True")

    act_f = build_act(make_obs_ph,
                      q_func,
                      n_actions,
                      scope=scope,
                      reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        gamma = tf.constant(gamma, name="gamma")
        obs_t_ph = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int64, shape=[None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, shape=[None], name="reward")
        obs_tp1_ph = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, shape=[None], name="done")
        weights_ph = tf.placeholder(tf.float32, shape=[None], name="weight")

        # q values of the online network (variables shared with act_f)
        q_t = q_func(obs_t_ph.get(), n_actions, scope="q_func", reuse=True)
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target q values
        q_target_tp1 = q_func(obs_tp1_ph.get(),
                              n_actions,
                              scope="q_target_func")
        q_target_vars = U.scope_vars(U.absolute_scope_name("q_target_func"))

        # double Q: the online network picks the next action, the target
        # network evaluates it
        q_tpl1 = q_func(obs_tp1_ph.get(),
                        n_actions,
                        scope='q_func',
                        reuse=True)
        responsible_actions = tf.argmax(q_tpl1, axis=1)
        double_q_value = tf.reduce_sum(
            q_target_tp1 * tf.one_hot(responsible_actions, n_actions),
            axis=1)

        # terminal transitions contribute no bootstrap value
        double_q_value_masked = (1.0 - done_mask_ph) * double_q_value
        q_true_value = rew_t_ph + gamma * double_q_value_masked
        q_current_value = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, n_actions),
                                        axis=1)

        # the target is held constant for the gradient
        td_error = q_current_value - tf.stop_gradient(q_true_value)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(errors * weights_ph)

        if grad_norm_clipping is not None:
            train_op = U.minimize_and_clip(optimizer,
                                           weighted_error,
                                           q_func_vars,
                                           clip_val=grad_norm_clipping)
        else:
            train_op = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # copy online variables into the target network, paired by sorted name
        with tf.variable_scope("update_target", reuse=False):
            update_target_ops = []
            for qvar, qtarget_var in zip(
                    sorted(q_func_vars, key=lambda v: v.name),
                    sorted(q_target_vars, key=lambda v: v.name)):
                update_target_ops.append(qtarget_var.assign(qvar))
            update_target_network = tf.group(*update_target_ops)

        # create callable functions
        train_f = U.make_function(inputs=[
            obs_t_ph, act_t_ph, rew_t_ph, obs_tp1_ph, done_mask_ph, weights_ph
        ],
                                  outputs=td_error,
                                  updates=[train_op])
        update_target_f = U.make_function([], [],
                                          updates=[update_target_network])
        q_values_f = U.make_function([obs_t_ph], q_t)
        return act_f, train_f, update_target_f, {'q_values': q_values_f}