Example #1
def build_train(make_obs_ph, 
        q_func, 
        num_actions, 
        optimizer, 
        grad_norm_clipping=None, 
        gamma=1.0, 
        old_qmin=-100,
        old_qmax=100,
        nbins=200,
        new_qmin=-100,
        new_qmax=100,
        double_q=False, 
        scope="deepq", 
        reuse=None, 
        param_noise=False, 
        param_noise_filter_func=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    old_qmin: float
        lower bound of the value support used for the categorical return distribution.
    old_qmax: float
        upper bound of the value support.
    nbins: int
        number of atoms (bins) in the value support.
    new_qmin: float
        lower bound of a rescaled support (not used in the code shown here).
    new_qmax: float
        upper bound of a rescaled support (not used in the code shown here).
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    val: (object, np.array, np.array, object, np.array, np.array) -> np.array
        same inputs as `train`, but only returns the error without applying any
        update (added to monitor whether the values are computed correctly).
    """
    if param_noise:
        act_f = build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse,
            param_noise_filter_func=param_noise_filter_func)
    else:
        act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse)

    print("build_train::num_actions: ", num_actions) #OK


    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        # q network evaluation
        q_t, q_t2D = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # q_t: (batch, num_actions*nbins) flat logits; q_t2D: (batch, num_actions, nbins)
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func")

        # target q network evaluation
        q_tp1, q_tp12D = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func")
        
        # DEBUG
        #print("tf.shape(act_t_ph): ", tf.shape(act_t_ph))
        #print("q_t.get_shape()): ", q_t.get_shape())
        #print("q_t2D.get_shape()): ", q_t2D.get_shape())
        #print("act_t_ph.get_shape()): ", act_t_ph.get_shape())
        #print("size.get_shape()): ", size.get_shape())
       

        # P(x_t, a_t): probability distribution over the nbins atoms for the action
        # actually taken in each transition (softmax over that action's logits)
        batch_size = 32  # fixed batch size assumed by the per-sample loops and reshapes below
        logits_list = []
        for i in range(batch_size):
            logits_list.append(tf.nn.softmax(q_t2D[i, act_t_ph[i], :]))
        logits_train = tf.stack(logits_list)  # v_dist_t_selected
        #print("logits_train.get_shape()): ", logits_train.get_shape())
       

        # Build the value support z and, for each action, compute the expected Q over the bins
        delta_z = (old_qmax - old_qmin) / (nbins - 1)
        start = old_qmin
        end = old_qmax + delta_z
        z = tf.range(start, end, delta_z)
        #print("z.get_shape: ", z.get_shape())
        
        # Q_{target}(\phi_{j+1},a,\theta)
        q_as = []
        for action in range(num_actions):
            dist = tf.nn.softmax(q_tp1[:,nbins*action:nbins*(action+1)])
            #print("dist.get_shape: ", dist.get_shape())
            q_a = tf.reduce_sum(tf.multiply(dist, z), axis=1, keep_dims=True)
            q_as.append(q_a)

        # max_a Q_{target}(\phi_{j+1},a,\theta)
        q_target_avg = tf.concat(q_as, axis=1)
        q_tp1_best = tf.reduce_max(q_target_avg, 1)        # max_a Q_target
        q_tp1_best_act = U.argmax(q_target_avg, axis=1)    # a^*
        q_tp1_best_act = tf.cast(q_tp1_best_act, tf.int32)
        print("q_tp1_best.get_shape(): ", q_tp1_best.get_shape())


        # compute RHS of bellman equation
        #q_tp1_best_masked = (1.0 - done_mask_ph) * tot_val
        #q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked# target Q value
        #q_t_selected_target_clip = tf.clip_by_value(q_t_selected_target,old_qmin,old_qmax)


        # extract P(x_{t+1}, a*): the target network's distribution for the greedy next action
        logits_list = []
        for i in range(batch_size):
            logits_list.append(tf.nn.softmax(q_tp12D[i, q_tp1_best_act[i], :]))
        logits_target = tf.stack(logits_list)  # v_dist_tp1_selected
        print("logits_target.get_shape(): ", logits_target.get_shape())

        # Cross entropy from (DIST_RL)
        z = tf.tile(tf.reshape(tf.range(old_qmin, old_qmax + delta_z, delta_z), [1,
            nbins]), [batch_size, 1])
        r = tf.tile(tf.reshape(rew_t_ph, [batch_size, 1]), [1, nbins])
        done = tf.tile(tf.reshape(done_mask_ph, [batch_size, 1]), [1,
            nbins])

        Tz = r + z * gamma * (1-done)
        Tz = tf.maximum(tf.minimum(Tz, old_qmax), old_qmin)
        b = (Tz - old_qmin)/delta_z # Should be float
        l,u = tf.floor(b), tf.ceil(b)
        l_id = tf.cast(l, tf.int32)
        u_id = tf.cast(u, tf.int32)
        
        v_dist_t_selected = tf.reshape(logits_train, [-1])  # P(x_t, a_t), flattened to (batch_size * nbins,)
        add_index = tf.range(batch_size) * nbins

        err = tf.zeros([batch_size])

        for j in range(nbins):
            l_index = l_id[:, j] + add_index
            u_index = u_id[:, j] + add_index

            p_tl = tf.gather(v_dist_t_selected, l_index)
            p_tu = tf.gather(v_dist_t_selected, u_index)

            log_p_tl = tf.log(p_tl)
            log_p_tu = tf.log(p_tu)
            p_tp1 = logits_target[:,j]

            err = err + p_tp1 * ((u[:,j] - b[:,j]) * log_p_tl + (b[:,j] -
                l[:,j]) * log_p_tu)

        err = tf.negative(err)
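        # The loop above implements the categorical projection and cross-entropy loss of
        # distributional RL / C51 (https://arxiv.org/abs/1707.06887): each target atom
        # Tz_j = r + gamma * (1 - done) * z_j (clipped to [old_qmin, old_qmax]) lands between
        # support indices l = floor(b) and u = ceil(b), with b = (Tz_j - old_qmin) / delta_z,
        # and its target mass p_{t+1,j} is split between them in proportion to (u - b) and (b - l):
        #     err = - sum_j p_{t+1,j} * [ (u_j - b_j) * log p_t(l_j) + (b_j - l_j) * log p_t(u_j) ]
        # Note that tf.log is applied to raw softmax outputs; adding a small epsilon inside
        # the log would be a common safeguard against log(0).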

        # compute the error (potentially clipped)
        weighted_error = tf.reduce_mean(importance_weights_ph * err)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=[err],

            updates=[optimize_expr]
        )
        val = U.function( # this is added only to monitor if values are calculated correctly
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=[err],
            #,q_t,q_tp1, q_t_selected , q_t_selected_target , q_tp1_best ,tot_val,q_t_val  ]
        )
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t2D)

        return act_f, train, update_target, {'q_values': q_values}, val
Example #2
def mode(self):
    return U.argmax(self.logits, axis=1)

def mode(self):
    # self.logits = tf.Print(self.logits, [self.logits, tf.reduce_sum(1 / (1 + tf.exp(self.logits[:]))), U.argmax(self.logits, axis=1)])
    # self.logits = tf.Print(self.logits, [tf.abs(1 / (1 + tf.exp(self.logits[:])) - 0.5)])
    return U.argmax(self.logits, axis=1)