def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0,
                old_qmin=-100, old_qmax=100, nbins=200, new_qmin=-100, new_qmax=100,
                double_q=False, scope="deepq", reuse=None, param_noise=False,
                param_noise_filter_func=None):
    """Creates the train function.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a pair of tensors: the flat logits of shape
        (batch_size, num_actions * nbins) and their reshaped view of shape
        (batch_size, num_actions, nbins).
    num_actions: int
        number of actions
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    old_qmin: float
        lower edge of the value support of the categorical distribution.
    old_qmax: float
        upper edge of the value support of the categorical distribution.
    nbins: int
        number of atoms in the categorical distribution.
    new_qmin: float
        unused in the current implementation.
    new_qmax: float
        unused in the current implementation.
    double_q: bool
        if true will use Double Q-Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    val: (object, np.array, np.array, object, np.array, np.array) -> np.array
        compute the per-sample cross-entropy errors without updating parameters
        (added only to monitor that the errors are computed correctly).
""" if param_noise: act_f = build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, param_noise_filter_func=param_noise_filter_func) else: act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse) print("build_train::num_actions: ", num_actions) #OK with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = make_obs_ph("obs_t") act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = make_obs_ph("obs_tp1") done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") # q network evaluation q_t,q_t2D = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # (1D num_actions* bins,2D num_actions,values) q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func") # target q network evalution q_tp1,q_tp12D = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func") # DEBUG #print("tf.shape(act_t_ph): ", tf.shape(act_t_ph)) #print("q_t.get_shape()): ", q_t.get_shape()) #print("q_t2D.get_shape()): ", q_t2D.get_shape()) #print("act_t_ph.get_shape()): ", act_t_ph.get_shape()) #print("size.get_shape()): ", size.get_shape()) # Compute train logits logits_list = [] for i in range(32): logits_list.append( tf.nn.softmax(q_t2D[i,act_t_ph[i],:]) ) logits_train = tf.stack(logits_list) # v_dist_t_selected #print("logits_train.get_shape()): ", logits_train.get_shape()) # For each action, compute average Q over bins delta_z = (old_qmax - old_qmin)/(nbins-1) start = old_qmin end = old_qmax + delta_z z = tf.range(start, end, delta_z) #print("z.get_shape: ", z.get_shape()) # Q_{target}(\phi_{j+1},a,\theta) q_as = [] for action in range(num_actions): dist = tf.nn.softmax(q_tp1[:,nbins*action:nbins*(action+1)]) #print("dist.get_shape: ", dist.get_shape()) q_a = tf.reduce_sum(tf.multiply(dist, z), axis=1, keep_dims=True) q_as.append(q_a) # max_a Q_{target}(\phi_{j+1},a,\theta) q_target_avg = tf.concat(q_as, axis=1) q_tp1_best = tf.reduce_max(q_target_avg, 1) # a^* q_tp1_best_act = U.argmax(q_tp1_best, axis=1) q_tp1_best_act = tf.cast(q_tp1_best_act, tf.int32) print("q_tp1_best.get_shape(): ", q_tp1_best.get_shape()) # compute RHS of bellman equation #q_tp1_best_masked = (1.0 - done_mask_ph) * tot_val #q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked# target Q value #q_t_selected_target_clip = tf.clip_by_value(q_t_selected_target,old_qmin,old_qmax) # extract P_(x_{t+1},a*) logits_list = [] for i in range(32): logits_list.append( tf.nn.softmax(q_tp12D[i,act_t_ph[i],:]) ) logits_target = tf.stack(logits_list) # v_dist_tp1_selected print("logits.get_shape()): ", logits_target.get_shape()) # Cross entropy from (DIST_RL) z = tf.tile(tf.reshape(tf.range(old_qmin, old_qmax + delta_z, delta_z), [1, nbins]), [batch_size, 1]) r = tf.tile(tf.reshape(rew_t_ph, [batch_size, 1]), [1, nbins]) done = tf.tile(tf.reshape(done_mask_ph, [batch_size, 1]), [1, nbins]) Tz = r + z * gamma * (1-done) T_z = tf.maximum(tf.minimum(T_z, old_qmax), old_qmin) b = (Tz - old_qmin)/delta_z # Should be float l,u = tf.floor(b), tf.ceil(b) l_id = tf.cast(l, tf.int32) u_id = tf.cast(u, tf.int32) v_t_dist_selected = tf.reshape(logits_train,[-1]) # P(x_t, a_t) add_index = tf.range(batch_size) * nbins err = tf.zeros([batch_size]) for 
        for j in range(nbins):
            l_index = l_id[:, j] + add_index
            u_index = u_id[:, j] + add_index
            p_tl = tf.gather(v_dist_t_selected, l_index)
            p_tu = tf.gather(v_dist_t_selected, u_index)
            log_p_tl = tf.log(p_tl)
            log_p_tu = tf.log(p_tu)
            p_tp1 = logits_target[:, j]
            # note: atoms where b lands exactly on an atom (l == u) contribute zero here
            err = err + p_tp1 * ((u[:, j] - b[:, j]) * log_p_tl + (b[:, j] - l[:, j]) * log_p_tu)
        err = tf.negative(err)

        # weight the per-sample errors by the importance weights (for prioritized replay)
        weighted_error = tf.reduce_mean(importance_weights_ph * err)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=err,
            updates=[optimize_expr]
        )
        # val is added only to monitor that the errors are computed correctly
        val = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=err
        )
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t2D)

        return act_f, train, update_target, {'q_values': q_values}, val
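# --- Hedged aside: what the cross-entropy loop above computes -----------------
# A minimal NumPy sketch of the categorical (C51-style) projection performed by
# the `for j in range(nbins)` loop in `build_train`. The name
# `project_distribution` and its signature are illustrative assumptions, not
# part of this module; the TF code above fuses this projection with the
# cross-entropy loss instead of materialising the target distribution `m`.
import numpy as np

def project_distribution(rewards, dones, next_dist, qmin=-100.0, qmax=100.0,
                         nbins=200, gamma=0.99):
    batch_size = rewards.shape[0]
    delta_z = (qmax - qmin) / (nbins - 1)
    z = np.linspace(qmin, qmax, nbins)              # support atoms z_0 .. z_{nbins-1}
    # Bellman-updated support Tz = r + gamma * z, clipped back into [qmin, qmax]
    Tz = rewards[:, None] + gamma * (1.0 - dones[:, None]) * z[None, :]
    Tz = np.clip(Tz, qmin, qmax)
    b = (Tz - qmin) / delta_z                       # fractional atom index
    l = np.floor(b).astype(int)
    u = np.ceil(b).astype(int)
    m = np.zeros((batch_size, nbins))               # projected target distribution
    for i in range(batch_size):
        for j in range(nbins):
            if l[i, j] == u[i, j]:                  # Tz landed exactly on an atom
                m[i, l[i, j]] += next_dist[i, j]
            else:                                   # split mass between the two neighbours
                m[i, l[i, j]] += next_dist[i, j] * (u[i, j] - b[i, j])
                m[i, u[i, j]] += next_dist[i, j] * (b[i, j] - l[i, j])
    return m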
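# --- Hedged usage sketch -------------------------------------------------------
# How `build_train` might be wired up. Everything outside this file (`model`,
# `env`, and the `U.BatchInput` helper) is an assumption patterned on old-style
# baselines.deepq code, not something defined in this module:
#
#     import tensorflow as tf
#     import baselines.common.tf_util as U
#
#     def make_obs_ph(name):
#         return U.BatchInput(env.observation_space.shape, name=name)
#
#     act, train, update_target, debug, val = build_train(
#         make_obs_ph=make_obs_ph,
#         q_func=model,  # must return (flat logits, [batch, num_actions, nbins] logits)
#         num_actions=env.action_space.n,
#         optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
#         gamma=0.99,
#     )
#     update_target()  # sync the target network once before training starts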
    def mode(self):
        return U.argmax(self.logits, axis=1)
    def mode(self):
        return U.argmax(self.logits, axis=1)
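    # Hedged aside on why `mode` can argmax the raw logits (NumPy used purely
    # for illustration; nothing below is defined elsewhere in this repo):
    # softmax is monotonic, so the most probable action is simply the index of
    # the largest logit.
    #
    #     import numpy as np
    #     logits = np.array([[0.2, 1.5, -0.3]])
    #     probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
    #     assert np.argmax(logits, axis=1) == np.argmax(probs, axis=1)  # both pick 1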