def adv_build_train(make_obs_ph, v_func, adv_func, num_actions, learning_rate, en, grad_norm_clipping=None, gamma=0.99, scope="advantage_learning", reuse=None, ): act_f, is_training = adv_build_act(make_obs_ph, adv_func, num_actions, en=en, scope=scope, reuse=reuse,) with tf.variable_scope(scope, reuse=reuse): adv_func_vars_list = [] target_adv_func_vars_list = [] error_list = [] # construct placeholders obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t")) act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1")) done_mask_ph = tf.placeholder(tf.float32, [None], name="done") obs_t_input_list = tf.split(obs_t_input.get(), en, axis=0) act_t_ph_list = tf.split(act_t_ph, en, axis=0) rew_t_ph_list = tf.split(rew_t_ph, en, axis=0) obs_tp1_input_list = tf.split(obs_tp1_input.get(), en, axis=0) done_mask_ph_list = tf.split(done_mask_ph, en, axis=0) # build v function v_t = tf.squeeze(v_func(obs_t_input.get(), scope="v_func", reuse=False)) v_t_list = tf.split(v_t, en, axis=0) v_func_vars = U.scope_vars(U.absolute_scope_name("v_func")) # build v target v_tp1 = tf.squeeze(v_func(obs_tp1_input.get(), scope="target_v_func", reuse=False)) v_tp1_list = tf.split(v_tp1, en, axis=0) target_v_func_vars = U.scope_vars(U.absolute_scope_name("target_v_func")) for count in range(en): # build BNN adv_t = adv_func(obs_t_input_list[count], num_actions, is_training=is_training, scope="adv_func" + str(count) + '_', reuse=True, ) adv_func_vars = U.scope_vars(U.absolute_scope_name("adv_func" + str(count) + '_')) adv_func_vars_list += adv_func_vars # build BNN target adv_tp1 = adv_func(obs_tp1_input_list[count], num_actions, is_training=False, scope="target_adv_func" + str(count) + '_', ) target_adv_func_vars_list += U.scope_vars(U.absolute_scope_name("target_adv_func" + str(count) + '_')) adv_t_selected = tf.reduce_sum(adv_t * tf.one_hot(act_t_ph_list[count], num_actions), 1) adv_tp1_best = tf.reduce_max(adv_tp1, 1) q_t_selected = v_t_list[count] + adv_t_selected q_tp1_best = v_tp1_list[count] + adv_tp1_best q_tp1_best_masked = (1.0 - done_mask_ph_list[count]) * q_tp1_best q_t_selected_target = rew_t_ph_list[count] + gamma * q_tp1_best_masked td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) errors = tf.reduce_mean(tf.square(td_error)) error_list.append(errors) all_vars_list = v_func_vars + adv_func_vars_list all_target_vars_list = target_v_func_vars + target_adv_func_vars_list total_loss = sum(error_list) assert grad_norm_clipping is not None optimize_expr = U.minimize_and_clip( tf.train.AdamOptimizer(learning_rate=learning_rate, epsilon=1e-4), total_loss, var_list=all_vars_list, clip_val=grad_norm_clipping ) update_target_expr = [] for var, var_target in zip(sorted(all_vars_list, key=lambda v: v.name), sorted(all_target_vars_list, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) train = U.function( inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, is_training, ], outputs=error_list, updates=[optimize_expr], givens={is_training:True} ) update_target = U.function([], [], updates=[update_target_expr]) return act_f, train, update_target
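# Hedged usage sketch (illustrative, not part of the original source): adv_build_train
# splits every batch across the `en` ensemble heads with tf.split, so batches fed to
# train() must have a size divisible by `en`, and grad_norm_clipping must be set because
# of the assert above. `v_model` and `adv_model` are assumed builders with the
# signatures this function expects.
def _example_adv_build_train_usage(make_obs_ph, v_model, adv_model, num_actions):
    act, train, update_target = adv_build_train(
        make_obs_ph=make_obs_ph,
        v_func=v_model,
        adv_func=adv_model,
        num_actions=num_actions,
        learning_rate=1e-4,
        en=4,                   # ensemble of 4 advantage heads
        grad_norm_clipping=10,  # required: asserted to be not None above
        gamma=0.99)
    # per batch (batch size a multiple of en):
    #   errors = train(obs_t, actions, rewards, obs_tp1, dones, True)
    return act, train, update_target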
def build_train(make_obs_ph, q_func, num_actions, optimizer, train_gaze,
                grad_norm_clipping=None, gamma=1.0, double_q=True,
                scope="DeepqWithGaze", reuse=None, param_noise=False,
                param_noise_filter_func=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    train_gaze: bool
        whether the gaze sub-network's weights are trainable; if False they are
        excluded from the optimizer's variable list.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    tensorboard_summary: (object) -> tf.Summary
        merged image summaries (current frame and gaze map) for tensorboard.
""" if param_noise: act_f = build_act_with_param_noise( make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, param_noise_filter_func=param_noise_filter_func) else: act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t")) act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1")) done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") initial_freeze_phase_ph = tf.placeholder(tf.bool, (), name="initial_freeze_phase") # q network evaluation q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act q_func_vars = gflag.qfunc_models.get( "q_func").weights # already includes gaze_models weights q_func_trainable_vars = [ w for w in gflag.qfunc_models.get("q_func").trainable_weights \ if (train_gaze or w not in gflag.gaze_models.get("q_func").trainable_weights) ] # train_gaze=False excludes gaze model's weight # target q network evalution q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") target_q_func_vars = gflag.qfunc_models.get( "target_q_func").weights # already includes gaze_models weights # q scores for actions which we know were selected in the given state. q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) # compute estimate of best possible value starting from state at t + 1 if double_q: q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) q_tp1_best_using_online_net = tf.arg_max(q_tp1_using_online_net, 1) q_tp1_best = tf.reduce_sum( q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) else: q_tp1_best = tf.reduce_max(q_tp1, 1) q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best # compute RHS of bellman equation q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) errors = U.huber_loss(td_error) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # compute optimization op (potentially with gradient clipping) initial_freeze_weights = gflag.qfunc_models.get_weight_names_for_initial_freeze( model_name="q_func") q_func_trainable_vars_for_initial_freeze = list( filter(lambda w: w.name not in initial_freeze_weights, q_func_trainable_vars)) if grad_norm_clipping is not None: optimize_expr_for_initial_freeze = lambda: U.minimize_and_clip(optimizer, weighted_error, var_list=q_func_trainable_vars_for_initial_freeze, clip_val=grad_norm_clipping) \ if q_func_trainable_vars_for_initial_freeze else tf.no_op() optimize_expr_after_freeze = lambda: U.minimize_and_clip( optimizer, weighted_error, var_list=q_func_trainable_vars, clip_val=grad_norm_clipping) else: # must put the operation under lambda, if you fully read tf.cond()'s documentation optimize_expr_for_initial_freeze = lambda: optimizer.minimize( weighted_error, var_list=q_func_trainable_vars_for_initial_freeze) optimize_expr_after_freeze = lambda: optimizer.minimize( weighted_error, var_list=q_func_trainable_vars) optimize_expr = tf.cond(initial_freeze_phase_ph, optimize_expr_for_initial_freeze, optimize_expr_after_freeze) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] assert 
len(q_func_vars) == len(target_q_func_vars) for var, var_target in zip(q_func_vars, target_q_func_vars): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # Create callable functions train = U.function(inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph, initial_freeze_phase_ph, ], outputs=td_error, updates=[optimize_expr], givens={K.backend.learning_phase(): 1}) update_target = U.function([], [], updates=[update_target_expr]) q_values = U.function([obs_t_input], q_t) # For tensorboard merged = tf.summary.merge([ tf.summary.image('img_curframe', obs_t_input.get()), tf.summary.image( 'gaze_curframe', q_func(obs_t_input.get(), num_actions, scope="q_func", return_gaze=True, reuse=True)) ]) tensorboard_summary = U.function( inputs=[obs_t_input], outputs=merged, givens={K.backend.learning_phase(): 0}) return act_f, train, update_target, { 'q_values': q_values }, tensorboard_summary
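# Hedged usage sketch (illustrative): this variant threads an extra boolean,
# `initial_freeze_phase`, through train() to pick between the two tf.cond() branches
# above (the weights named by gflag.qfunc_models.get_weight_names_for_initial_freeze()
# stay frozen early on). The `gaze_q_func` builder passed in is an assumption.
def _example_gaze_build_train_usage(make_obs_ph, gaze_q_func, num_actions):
    act, train, update_target, debug, tensorboard_summary = build_train(
        make_obs_ph=make_obs_ph,
        q_func=gaze_q_func,
        num_actions=num_actions,
        optimizer=tf.train.AdamOptimizer(learning_rate=1e-4),
        train_gaze=False,        # keep the gaze sub-network's weights out of the update
        grad_norm_clipping=10,
        gamma=0.99)
    # per batch:
    #   td = train(obs_t, actions, rewards, obs_tp1, dones, weights,
    #              initial_freeze_phase)   # initial_freeze_phase: python bool
    return act, train, update_target, debug, tensorboard_summary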
def build_train_modelbased(make_obs_ph, net_func, model_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, scope="mfec", latent_dim=32, input_dim=84 * 84 * 4, hash_dim=32, K=10, beta=0.1, predict=True, reuse=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. """ z_func = build_act_modelbased(make_obs_ph, net_func, num_actions, scope=scope, secondary_scope="net_func", reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders # EMDQN tau = tf.placeholder(tf.float32, [1], name='tau') # momentum = tf.placeholder(tf.float32, [1], name='momentum') obs_mc_input_query = U.ensure_tf_input(make_obs_ph("obs_query")) obs_mc_input_positive = U.ensure_tf_input(make_obs_ph("enc_obs_pos")) obs_mc_input_negative = U.ensure_tf_input(make_obs_ph("enc_obs_neg")) obs_mc_input_model_t = U.ensure_tf_input(make_obs_ph("obs_query")) obs_mc_input_model_tp1 = U.ensure_tf_input(make_obs_ph("obs_query")) reward_input_model = tf.placeholder(tf.float32, [None], name='reward') action_input_model = tf.placeholder(tf.int32, [None], name='action') latent_input_out = tf.placeholder(tf.float32, [None, latent_dim], name='latent') action_input_out = tf.placeholder(tf.int32, [None], name='action_input_out') # inputs = [obs_mc_input] # inputs = [tau, obs_mc_input_query, obs_mc_input_positive, obs_mc_input_negative] inputs = [ tau, obs_mc_input_query, obs_mc_input_positive, obs_mc_input_negative, obs_mc_input_model_t, obs_mc_input_model_tp1, reward_input_model, action_input_model ] z_mc_model_t, _ = net_func(obs_mc_input_model_t.get(), num_actions, scope="net_func", reuse=True) z_mc_model_tp1, _ = net_func(obs_mc_input_model_tp1.get(), num_actions, scope="net_func", reuse=True) z_mc_out, reward_out = model_func(latent_input_out, action_input_out, num_actions, scope="model_func", reuse=reuse) z_mc_model_tp1_predict, reward_predict = model_func(z_mc_model_t, action_input_model, num_actions, scope="model_func", reuse=True) z_mc, _ = net_func(obs_mc_input_query.get(), num_actions, scope="net_func", reuse=True) # _, v_mc = net_func( # obs_mc_input_query.get(), num_actions, # scope="net_func", # reuse=True) z_mc_pos, v_mc_pos = net_func(obs_mc_input_positive.get(), num_actions, scope="net_func", reuse=True) z_mc_neg, v_mc_neg = 
net_func(obs_mc_input_negative.get(), num_actions, scope="net_func", reuse=True) z_mc_pos = tf.reshape(z_mc_pos, [-1, 1, latent_dim]) z_mc = tf.reshape(z_mc, [-1, latent_dim, 1]) z_mc_neg = tf.reshape(z_mc_neg, [-1, K, latent_dim]) negative = tf.matmul(z_mc_neg, z_mc) / tau sum_negative = tf.squeeze(tf.reduce_sum(tf.exp(negative), axis=1)) positive = tf.squeeze(tf.matmul(z_mc_pos, z_mc) / tau) print("shape:", z_mc.shape, z_mc_pos.shape, z_mc_neg.shape, sum_negative.shape, negative.shape, positive.shape) contrast_loss = tf.reduce_mean(tf.log(sum_negative) - positive) # # print("shape2:", z_mc.shape, negative.shape, positive.shape) # # prediction_loss = tf.losses.mean_squared_error(value_input, v_mc) # total_loss = contrast_loss # if predict: # total_loss += beta * prediction_loss model_func_vars = U.scope_vars( U.absolute_scope_name("model_func")) + U.scope_vars( U.absolute_scope_name("net_func")) # encoder_net_func_vars = U.scope_vars(U.absolute_scope_name("encoder_net_func")) transition_loss = tf.reduce_sum( tf.square(z_mc_model_tp1 - z_mc_model_tp1_predict)) reward_loss = tf.reduce_sum( tf.square(reward_predict - reward_input_model)) total_loss = contrast_loss + transition_loss + reward_loss if grad_norm_clipping is not None: optimize_expr_contrast_with_prediction = U.minimize_and_clip( optimizer, total_loss, var_list=model_func_vars, clip_val=grad_norm_clipping) else: optimize_expr_contrast_with_prediction = optimizer.minimize( total_loss, var_list=model_func_vars) # Create callable functions # update_target_fn will be called periodically to copy Q network to target Q network z_var_summary = tf.summary.scalar( "z_var", tf.reduce_mean(tf.math.reduce_std(z_mc_model_t, axis=1))) negative_summary = tf.summary.scalar( "negative", tf.reduce_mean(tf.reduce_mean(negative))) positive_summary = tf.summary.scalar( "positive", tf.reduce_mean(tf.reduce_mean(positive))) contrast_loss_summary = tf.summary.scalar( "contrast loss", tf.reduce_mean(contrast_loss)) transition_loss_summary = tf.summary.scalar( "transition loss", tf.reduce_mean(transition_loss)) trivial_loss_summary = tf.summary.scalar( "trivial loss", tf.reduce_mean(tf.square(z_mc_model_t - z_mc_model_tp1))) reward_loss_summary = tf.summary.scalar("reward loss", tf.reduce_mean(reward_loss)) # prediction_loss_summary = tf.summary.scalar("prediction loss", tf.reduce_mean(prediction_loss)) total_loss_summary = tf.summary.scalar("total loss", tf.reduce_mean(total_loss)) summaries = [ z_var_summary, negative_summary, positive_summary, contrast_loss_summary, trivial_loss_summary, transition_loss_summary, reward_loss_summary, total_loss_summary ] summary = tf.summary.merge(summaries) train = U.function(inputs=inputs, outputs=[total_loss, summary], updates=[optimize_expr_contrast_with_prediction]) prediction = U.function(inputs=[latent_input_out, action_input_out], outputs=[z_mc_out, reward_out]) return z_func, prediction, train
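# Hedged usage sketch (illustrative): build_train_modelbased returns the encoder
# (z_func), the latent-model rollout (prediction) and train; train() consumes the
# contrastive triplet plus one environment transition. `batch` below is an assumed
# container, and the K negatives are stacked along the batch axis.
def _example_modelbased_train_step(train, batch, tau=0.1):
    import numpy as np
    total_loss, summary = train(
        np.array([tau], dtype=np.float32),  # softmax temperature, shape [1]
        batch.obs_query,                    # [B, ...]
        batch.obs_positive,                 # [B, ...]
        batch.obs_negative,                 # [B * K, ...]
        batch.obs_t,                        # [B, ...]
        batch.obs_tp1,                      # [B, ...]
        batch.rewards,                      # [B]
        batch.actions)                      # [B]
    return total_loss, summary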
key=lambda x: x.name)): if (main_var.name.replace("train_base_net", "") == target_var.name.replace( "target_base_net", "")): assign_ops.append(tf.assign(target_var, main_var)) print("Copying Ops.:", len(assign_ops)) copy_operation = tf.group(*assign_ops) from collections import deque replay_buffer = deque(maxlen=50000) optimizer = tf.train.AdamOptimizer(learning_rate=1e-2) from baselines.common import tf_util train_step = tf_util.minimize_and_clip(optimizer, iqn.loss, var_list=train_variables) optimizer_sampling = tf.train.AdamOptimizer(learning_rate=1e-2) train_step_sampling = tf_util.minimize_and_clip(optimizer_sampling, iqn.sampling_loss, var_list=sampling_variables) def train(x, a, r=None, x_p=None, t=None, true_return=None): if true_return is not None: return sess.run( [iqn.sampling_loss, train_step_sampling], feed_dict={ iqn.train_net.state: x, iqn.action_placeholder: a,
key=lambda x: x.name)): if (main_var.name.replace("train_base_net", "") == target_var.name.replace( "target_base_net", "")): assign_ops.append(tf.assign(target_var, main_var)) print("Copying Ops.:", len(assign_ops)) copy_operation = tf.group(*assign_ops) from collections import deque replay_buffer = deque(maxlen=50000) optimizer = tf.train.AdamOptimizer(learning_rate=1e-2) from baselines.common import tf_util train_step = tf_util.minimize_and_clip(optimizer, iqn.loss, var_list=train_variables) def train(x, a, r, x_p, t): return sess.run( [iqn.loss, train_step], feed_dict={ iqn.train_net.state: x, iqn.action_placeholder: a, iqn.r: r, iqn.t: t, iqn.target_net.state: x_p })
def imit_build_train( make_obs_ph, bnn_func, learning_rate, num_actions, en, # raw_input_ph is the placeholder accept the raw image raw_input_ph, target_output, gamma, grad_norm_clipping=None, alpha=20, bnn_explore=0.01, scope="Imitation", reuse=None, use_sign=False, ): bnn_act_f, is_training = imit_build_act( make_obs_ph, bnn_func, num_actions, bnn_explore=bnn_explore, en=en, scope=scope, reuse=reuse, use_sign=use_sign, ) with tf.variable_scope(scope, reuse=reuse): loss_list = [] bnn_func_vars_list = [] obs_t = tf.cast(raw_input_ph, tf.float32) / 255.0 obs_t_list = tf.split(obs_t, en, axis=0) target_output_list = tf.split(target_output, en, axis=0) # TODO accu_list = [] target_label_list = tf.split(tf.argmax(target_output, axis=1), en, axis=0) for count in range(en): bnn_output = bnn_func(obs_t_list[count], num_actions, scope="bnn_func" + str(count) + '_', is_training=is_training, reuse=True) bnn_func_vars = U.scope_vars( U.absolute_scope_name("bnn_func" + str(count) + '_')) bnn_func_vars_list += bnn_func_vars loss_list.append( tf.reduce_mean( tf.square(bnn_output - alpha * target_output_list[count]))) #TODO predict = tf.argmax(bnn_output, axis=1) accu = tf.reduce_mean( tf.cast(tf.equal(predict, target_label_list[count]), "float")) accu_list.append(accu) total_loss = sum(loss_list) assert grad_norm_clipping is not None optimize_expr = U.minimize_and_clip(tf.train.AdamOptimizer( learning_rate=learning_rate, epsilon=1e-4), total_loss, var_list=bnn_func_vars_list, clip_val=grad_norm_clipping) train = U.function( inputs=[raw_input_ph, is_training], # TODO outputs=accu_list + loss_list, updates=[optimize_expr], givens={is_training: True}, ) return bnn_act_f, train
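# Hedged usage sketch (illustrative): imit_build_train regresses each BNN head onto
# alpha * target_output, so train() only needs the raw uint8 frames (they are rescaled
# to [0, 1] above); the batch size must be a multiple of `en`. The returned list is the
# per-head accuracies followed by the per-head losses.
def _example_imit_train_step(train, raw_frames):
    outputs = train(raw_frames, True)     # second argument feeds is_training
    en = len(outputs) // 2
    accuracies, losses = outputs[:en], outputs[en:]
    return accuracies, losses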
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None,
                gamma=1.0, double_q=True, scope="deepq", reuse=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        # q network evaluation
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=td_error,
            updates=[optimize_expr]
        )
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, {'q_values': q_values}
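# Hedged usage sketch (illustrative, not from this file): wiring build_train into a
# minimal setup. The MLP q_func, the CartPole env and U.BatchInput / U.make_session /
# U.initialize are assumptions about the surrounding project, not guaranteed by it.
def _example_deepq_build_train_usage():
    import gym
    import numpy as np

    def mlp_q_func(obs, num_actions, scope, reuse=False):
        # minimal q_func with the expected (obs, num_actions, scope, reuse) signature
        with tf.variable_scope(scope, reuse=reuse):
            hidden = tf.layers.dense(tf.layers.flatten(obs), 64, activation=tf.nn.relu)
            return tf.layers.dense(hidden, num_actions)

    env = gym.make("CartPole-v0")
    act, train, update_target, debug = build_train(
        make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name),
        q_func=mlp_q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        grad_norm_clipping=10,
        gamma=0.99)

    with U.make_session(8):
        U.initialize()
        update_target()  # sync target network once before training
        obs = np.array([env.reset()], dtype=np.float32)
        # one dummy training step on a single transition (uniform importance weight)
        td_error = train(obs, np.array([0], dtype=np.int32),
                         np.array([0.0], dtype=np.float32),
                         obs, np.array([0.0], dtype=np.float32),
                         np.array([1.0], dtype=np.float32))
        # actions = act(obs, stochastic=True, update_eps=0.1)  # baselines-style act signature (assumed)
    return td_error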
def build_train(make_obs_ph, var_func, cvar_func, num_actions, nb_atoms, optimizer, grad_norm_clipping=None, gamma=1.0, scope="cvar_dqn", reuse=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name var_func: (tf.Variable, int, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions nb_atoms: int number of atoms scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. cvar_func: (tf.Variable, int, str, bool) -> tf.Variable see var_func num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. """ act_f = build_act(make_obs_ph, cvar_func, var_func, num_actions, nb_atoms, scope=scope, reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t")) act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1")) done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") # atoms y = tf.range(1, nb_atoms + 1, dtype=tf.float32, name='y') * 1. / nb_atoms # ------------------------------- Core networks --------------------------------- # var network var_t = var_func(obs_t_input.get(), num_actions, nb_atoms, scope="out_func", reuse_main=True, reuse_last=True) # reuse from act # vars for actions which we know were selected in the given state. var_t_selected = gather_along_second_axis(var_t, act_t_ph) var_t_selected.set_shape([None, nb_atoms]) # cvar network cvar_t = cvar_func(obs_t_input.get(), num_actions, nb_atoms, scope="out_func", reuse_main=True, reuse_last=True) # reuse from act # cvars for actions which we know were selected in the given state. 
cvar_t_selected = gather_along_second_axis(cvar_t, act_t_ph) cvar_t_selected.set_shape([None, nb_atoms]) # target cvar network cvar_tp1 = cvar_func(obs_tp1_input.get(), num_actions, nb_atoms, scope="target_cvar_func") # extract variables joint_variables = U.scope_vars(U.absolute_scope_name("out_func/net")) var_variables = U.scope_vars(U.absolute_scope_name("out_func/var")) cvar_variables = U.scope_vars(U.absolute_scope_name("out_func/cvar")) target_cvar_func_variables = U.scope_vars( U.absolute_scope_name("target_cvar_func")) # ------------------------------------------------------------------------------- # ----------------------------- Extract distribution ---------------------------- # construct a new cvar with different actions for each atom cvar_tp1_star = tf.reduce_max(cvar_tp1, axis=1) cvar_tp1_star.set_shape([None, nb_atoms]) # construct a distribution from the new cvar ycvar_tp1_star = cvar_tp1_star * y dist_tp1_star_ = extract_distribution(ycvar_tp1_star, nb_atoms) # apply done mask dist_tp1_star = tf.einsum('ij,i->ij', dist_tp1_star_, 1. - done_mask_ph) # Td = r + gamma * dist dist_target = tf.identity(rew_t_ph[:, None] + gamma * dist_tp1_star, name='dist_target') # dist is always non-differentiable dist_target = tf.stop_gradient(dist_target) # ------------------------------------------------------------------------------- # ---------------------------------- VaR loss ----------------------------------- td_error = dist_target[:, :, None] - var_t_selected[:, None, :] # td_error[0]= # [[Td1-v1 Td1-v2 ... Td1-vn] # [Td2-v1 Td2-v2 ... Td2-vn] # [... ] # [Tdn-v1 Tdn-v2 ... Tdn-vn]] negative_indicator = tf.cast(td_error < 0, tf.float32) var_weights = tf.stop_gradient( y - negative_indicator) # XXX: stop gradient? quantile_loss = var_weights * td_error var_error = tf.reduce_mean(quantile_loss) # ------------------------------------------------------------------------------- # ---------------------------------- CVaR loss ---------------------------------- # Minimizing the MSE of: # V_i + 1/y_i(Td_j - V_i)^- - C_i min_target_diff = negative_indicator / y * tf.stop_gradient(td_error) cvar_loss = tf.stop_gradient( var_t_selected )[:, None, :] + min_target_diff - cvar_t_selected[:, None, :] cvar_error = tf.reduce_mean(tf.square(cvar_loss)) # ------------------------------------------------------------------------------- # ------------------------------- Finalizing ------------------------------------ error = var_error + cvar_error # compute optimization op (potentially with gradient clipping) var_list = [joint_variables, var_variables, cvar_variables] if grad_norm_clipping is not None: optimize_expr = U.minimize_and_clip(optimizer, error, var_list, clip_val=grad_norm_clipping) else: optimize_expr = optimizer.minimize(error, var_list=var_list) # update_target_fn will be called periodically to copy cvar network to target cvar network # Note: var has no target update_target_expr = [] for cvar_variable, target_cvar_variable in zip( sorted(joint_variables + cvar_variables, key=lambda v: v.name), sorted(target_cvar_func_variables, key=lambda v: v.name)): update_target_expr.append( target_cvar_variable.assign(cvar_variable)) update_target_expr = tf.group(*update_target_expr) # Create callable functions train = U.function(inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph ], outputs=error, updates=[optimize_expr]) update_target = U.function([], [], updates=[update_target_expr]) # 
------------------------------------------------------------------------------- # --------------------------------- Debug --------------------------------------- # a = U.function([obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph], var_t_selected) # b = U.function([obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph], cvar_t_selected) # c = U.function([obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph], big_dist_target*y) # b = U.function([obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph], var_t) # c = U.function([obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph], negative_indicator) # d = U.function([obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph], big_yc_target) # e = U.function([obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph], cvar_t) # f = U.function([obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph], cvar_loss) # atoms = U.function([obs_tp1_input], atoms) # ------------------------------------------------------------------------------- return act_f, train, update_target, []
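# Hedged numeric sketch (illustrative, not from the source): the VaR head above is fit
# with quantile-regression weights (y - 1{td_error < 0}). For the atom y = 0.25, an
# under-estimate (td_error > 0) is pulled up with weight 0.25, while an over-estimate is
# pushed down with weight 0.75, which drives that head toward the 25%-quantile of the
# target distribution.
def _example_quantile_weighting(y, td_error):
    import numpy as np
    td_error = np.asarray(td_error, dtype=np.float32)
    negative_indicator = (td_error < 0).astype(np.float32)
    return (y - negative_indicator) * td_error  # mirrors quantile_loss above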
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None, distributed=False, v_min=-10.0, v_max=10.0, atoms=51): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) param_noise_filter_func: tf.Variable -> bool function that decides whether or not a variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter is used by default. distributed: bool whether or not distributed version is enabled. v_min: float lower boundary for value, only works when distributed version is enabled. v_max: float upper boundary for value, only works when distributed version is enabled. atoms: int number of atoms, only works when distributed version is enabled. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. """ print("build train use distributed? 
", distributed) if param_noise: act_f = build_act_with_param_noise( make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, param_noise_filter_func=param_noise_filter_func, distributed=distributed, v_min=v_min, v_max=v_max, atoms=atoms) else: act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, distributed=distributed, v_min=v_min, v_max=v_max, atoms=atoms) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t")) act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1")) done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") distributed_target_ph = tf.placeholder(tf.float32, [None, atoms], name="dis_target") # q network evaluation if not distributed: q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act # target q network evalution q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") else: q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) target_q_func_vars = U.scope_vars( U.absolute_scope_name("target_q_func")) # q scores for actions which we know were selected in the given state. if not distributed: q_t_selected = tf.reduce_sum( q_t * tf.one_hot(act_t_ph, num_actions), 1) else: probability_qt = tf.nn.softmax(q_t) q_t_selected = tf.reduce_sum( q_t * tf.tile(tf.expand_dims(tf.one_hot(act_t_ph, num_actions), 2), [1, 1, atoms]), 1) # compute estimate of best possible value starting from state at t + 1 if double_q: print("use double") if not distributed: q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) q_tp1_best_using_online_net = tf.arg_max( q_tp1_using_online_net, 1) q_tp1_best = tf.reduce_sum( q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) else: q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) q_tp1_best = get_distibute_q(q_tp1_using_online_net, v_min, v_max, atoms, obs_tp1_input) a_tp1_best = tf.argmax(q_tp1_best, 1) probability_qt1 = tf.nn.softmax(q_tp1_using_online_net) q_tp1_best = tf.reduce_sum( probability_qt1 * tf.tile( tf.expand_dims(tf.one_hot(a_tp1_best, num_actions), 2), [1, 1, atoms]), 1) else: print("not use double") if not distributed: q_tp1_best = tf.reduce_max(q_tp1, 1) else: if distributed: q_tp1_best = get_distibute_q(q_tp1, v_min, v_max, atoms, obs_tp1_input) a_tp1_best = tf.argmax(q_tp1_best, 1) probability_qt1 = tf.nn.softmax(q_tp1) q_tp1_best = tf.reduce_sum( probability_qt1 * tf.tile( tf.expand_dims(tf.one_hot(a_tp1_best, num_actions), 2), [1, 1, atoms]), 1) mask = 1.0 - done_mask_ph if not distributed: q_tp1_best_masked = mask * q_tp1_best else: q_tp1_best_masked = q_tp1_best # compute RHS of bellman equation if not distributed: q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) errors = U.huber_loss(td_error) else: clip_target = tf.clip_by_value(distributed_target_ph, 1e-8, 1.0) clip_select = tf.clip_by_value(tf.nn.softmax(q_t_selected), 1e-8, 1.0) # use kl divergence td_error = tf.reduce_sum( clip_target * 
(tf.log(clip_target) - tf.log(clip_select)), axis=-1) errors = tf.nn.softmax_cross_entropy_with_logits( labels=distributed_target_ph, logits=q_t_selected) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: optimize_expr = U.minimize_and_clip(optimizer, weighted_error, var_list=q_func_vars, clip_val=grad_norm_clipping) else: optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # Create callable functions if distributed: train = U.function(inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph, distributed_target_ph ], outputs=td_error, updates=[optimize_expr]) else: train = U.function(inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph, ], outputs=td_error, updates=[optimize_expr]) update_target = U.function([], [], updates=[update_target_expr]) q_values = U.function([obs_t_input], q_t) q_tp1_best_final = U.function([obs_tp1_input], q_tp1_best) return act_f, train, update_target, { 'q_values': q_values, 'q_t1_best': q_tp1_best_final }
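# Hedged sketch (assumption, not code from this repository): when `distributed` is
# enabled the caller has to supply `distributed_target_ph` itself; the graph above only
# consumes it. One common way to build such a target is the C51-style categorical
# projection of r + gamma * z onto the fixed support [v_min, v_max].
def _example_categorical_target(rewards, dones, next_probs,
                                v_min=-10.0, v_max=10.0, atoms=51, gamma=0.99):
    import numpy as np
    z = np.linspace(v_min, v_max, atoms)              # fixed support
    dz = (v_max - v_min) / (atoms - 1)
    target = np.zeros((rewards.shape[0], atoms), dtype=np.float32)
    for i in range(rewards.shape[0]):
        for j in range(atoms):
            tz = np.clip(rewards[i] + (1.0 - dones[i]) * gamma * z[j], v_min, v_max)
            b = (tz - v_min) / dz                     # fractional index on the support
            lo, hi = int(np.floor(b)), int(np.ceil(b))
            if lo == hi:
                target[i, lo] += next_probs[i, j]
            else:
                target[i, lo] += next_probs[i, j] * (hi - b)
                target[i, hi] += next_probs[i, j] * (b - lo)
    return target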
def build_train_ib(make_obs_ph, model_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, beta=1.0, theta=1, double_q=True, emdqn=True, vae=True, ib=True, scope="deepq_ib", reuse=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. beta: float coefficient of beta-ib. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. 
""" act_noise = tf.placeholder(tf.float32, [None, 512], name="act_noise") act_f = build_act_ib(make_obs_ph, model_func, act_noise, num_actions, scope=scope, reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t")) act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") z_noise_t = tf.placeholder(tf.float32, [None, 512], name="z_noise") z_noise_tp1 = tf.placeholder(tf.float32, [None, 512], name="z_noise_tp1") obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1")) done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") inputs = [ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph, act_noise, z_noise_t, z_noise_tp1 ] # EMDQN if emdqn or ib: qec_input = tf.placeholder(tf.float32, [None], name='qec') inputs.append(qec_input) if ib or vae: obs_vae_input = U.ensure_tf_input(make_obs_ph("obs_vae")) z_noise_vae = tf.placeholder(tf.float32, [None, 512], name="z_noise_vae") inputs.append(obs_vae_input) inputs.append(z_noise_vae) # q network evaluation q_t, v_mean_t, v_logvar_t, z_mean_t, z_logvar_t, recon_obs_t = model_func( obs_t_input.get(), z_noise_t, num_actions, scope="q_func", reuse=True) if vae or ib: q_vae, v_mean_vae, v_logvar_vae, z_mean_vae, z_logvar_vae, recon_obs = model_func( obs_vae_input.get(), z_noise_vae, num_actions, scope="q_func", reuse=True) # q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) # target q network evalution q_tp1, q_d_tp1, v_mean_tp1, v_logvar_tp1, z_mean_tp1, z_logvar_tp1, recon_obs_tp1 = model_func( obs_tp1_input.get(), z_noise_tp1, num_actions, scope="target_q_func") target_q_func_vars = U.scope_vars( U.absolute_scope_name("target_q_func")) # q scores for actions which we know were selected in the given state. 
q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) # compute estimate of best possible value starting from state at t + 1 if double_q: q_tp1_using_online_net, _, _, _, _, _, _ = model_func( obs_tp1_input.get(), z_noise_tp1, num_actions, scope="q_func", reuse=True) q_tp1_best_using_online_net = tf.arg_max(q_tp1_using_online_net, 1) q_tp1_best = tf.reduce_sum( q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) else: q_tp1_best = tf.reduce_max(q_tp1, 1) q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best # compute RHS of bellman equation q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) td_loss = tf.reduce_mean(importance_weights_ph * U.huber_loss(td_error)) outputs = [td_loss] total_loss = td_loss if vae or ib: encoder_loss = -1 + z_mean_vae**2 + tf.exp( z_logvar_vae) - z_logvar_vae outputs.append(encoder_loss) total_loss += 0.1 * tf.reduce_mean(beta * encoder_loss) if vae: decoder_loss = tf.keras.losses.binary_crossentropy( tf.reshape(recon_obs, [-1]), tf.reshape( tf.dtypes.cast(obs_vae_input._placeholder, tf.float32), [-1])) print("here", z_mean_t.shape, z_logvar_t.shape, encoder_loss.shape, decoder_loss.shape) vae_loss = beta * encoder_loss + theta * decoder_loss outputs.append(decoder_loss) outputs.append(vae_loss) total_loss += 0.1 * tf.reduce_mean(theta * decoder_loss) if ib: ib_loss = (v_mean_t - tf.stop_gradient(tf.expand_dims( qec_input, 1)))**2 / tf.exp(v_logvar_t) + v_logvar_t print("here2", v_mean_t.shape, tf.expand_dims(qec_input, 1).shape, v_logvar_t.shape, ib_loss.shape) total_ib_loss = ib_loss + beta * encoder_loss outputs.append(total_ib_loss) total_loss += 0.1 * tf.reduce_mean(ib_loss) # EMDQN if emdqn: qec_error = q_t_selected - tf.stop_gradient(qec_input) total_loss += 0.1 * tf.reduce_mean( importance_weights_ph * U.huber_loss(qec_error)) outputs.append(qec_error) td_loss_summary = tf.summary.scalar("td loss", td_loss) total_loss_summary = tf.summary.scalar("total loss", total_loss) z_var_summary = tf.summary.scalar("z_var", tf.reduce_mean(tf.exp(z_logvar_t))) summaries = [td_loss_summary, total_loss_summary, z_var_summary] if vae or ib: encoder_loss_summary = tf.summary.scalar( "encoder loss", tf.reduce_mean(encoder_loss)) summaries.append(encoder_loss_summary) if vae: decoder_loss_summary = tf.summary.scalar( "decoder loss", tf.reduce_mean(decoder_loss)) summaries.append(decoder_loss_summary) if ib: ib_loss_summary = tf.summary.scalar("ib loss", tf.reduce_mean(ib_loss)) total_ib_loss_summary = tf.summary.scalar( "total ib loss", tf.reduce_mean(total_ib_loss)) summaries.append(ib_loss_summary) summaries.append(total_ib_loss_summary) if emdqn: qec_loss_summary = tf.summary.scalar( "qec loss", tf.reduce_mean(importance_weights_ph * qec_error)) summaries.append(qec_loss_summary) summary = tf.summary.merge(summaries) outputs.append(summary) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: optimize_expr = U.minimize_and_clip(optimizer, total_loss, var_list=q_func_vars, clip_val=grad_norm_clipping) else: optimize_expr = optimizer.minimize(total_loss, var_list=q_func_vars) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = 
tf.group(*update_target_expr) # Create callable functions train = U.function(inputs=inputs, outputs=[td_error, summary], updates=[optimize_expr]) get_q_t_selected = U.function( inputs=[obs_t_input, act_t_ph, z_noise_t], outputs=q_t_selected) update_target = U.function([], [], updates=[update_target_expr]) q_values = U.function([obs_t_input, z_noise_t], q_t) return act_f, train, update_target, { 'q_values': q_values }, get_q_t_selected
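# Hedged usage sketch (illustrative): every z_noise_* / act_noise placeholder above is a
# [batch, 512] standard-Gaussian sample drawn by the caller; with emdqn/vae/ib enabled
# the input list additionally carries the qec values and the VAE batch, in the order the
# `inputs` list was assembled.
def _example_ib_noise_batch(batch_size, dim=512):
    import numpy as np
    return {
        "act_noise": np.random.randn(batch_size, dim).astype(np.float32),
        "z_noise_t": np.random.randn(batch_size, dim).astype(np.float32),
        "z_noise_tp1": np.random.randn(batch_size, dim).astype(np.float32),
        "z_noise_vae": np.random.randn(batch_size, dim).astype(np.float32),
    }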
def build_train_mfmc(make_obs_ph, model_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, batch_size=5, scope="mfec", latent_dim=32, input_dim=84 * 84 * 4, hash_dim=32, K=10, beta=0.1, predict=True, use_rp=False, reuse=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. """ z_func = build_act_mfmc(make_obs_ph, model_func, num_actions, scope=scope, secondary_scope="model_func", reuse=reuse) # encoder_z_func = build_act_mfmc(make_obs_ph, model_func, num_actions, scope=scope, # secondary_scope="encoder_model_func", reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders # EMDQN tau = tf.placeholder(tf.float32, [1], name='tau') # momentum = tf.placeholder(tf.float32, [1], name='momentum') obs_hash_input = U.ensure_tf_input(make_obs_ph("obs_hash")) obs_mc_input = U.ensure_tf_input(make_obs_ph("obs")) obs_mc_input_query = U.ensure_tf_input(make_obs_ph("obs_query")) # obs_mc_input_positive = U.ensure_tf_input(make_obs_ph("enc_obs_pos")) keys_mc_input_negative = tf.placeholder(tf.float32, [None, K, latent_dim], name='enc_keys_neg') keys_mc_input_positive = tf.placeholder(tf.float32, [None, latent_dim], name='enc_keys_pos') keys_mc_input_anchor = tf.placeholder(tf.float32, [None, latent_dim], name='enc_keys_anchor') # keys_mc_input_anchor = tf.Variable(initial_value=np.zeros((batch_size, latent_dim)), # shape=[batch_size, latent_dim], # name='enc_keys_anchor', # dtype=tf.float32) # # keys_mc_input_positive = tf.Variable(initial_value=np.zeros((batch_size, latent_dim)), # shape=[batch_size, latent_dim], # name='enc_keys_pos', # dtype=tf.float32) # keys_mc_input_negative = tf.Variable(initial_value=np.zeros((batch_size, K, latent_dim)), # shape=[batch_size, K, latent_dim], # name='enc_keys_neg', # dtype=tf.float32) # inputs = [obs_mc_input] value_input = tf.placeholder(tf.float32, [None, 1], name='value') if predict: inputs = [ tau, obs_mc_input_query, keys_mc_input_positive, keys_mc_input_negative, keys_mc_input_anchor, obs_mc_input, value_input ] else: inputs = [ tau, obs_mc_input_query, keys_mc_input_positive, keys_mc_input_negative, keys_mc_input_anchor ] z_mc, _ = model_func(obs_mc_input_query.get(), num_actions, scope="model_func", reuse=True) _, v_mc = model_func(obs_mc_input.get(), num_actions, scope="model_func", 
reuse=True) # encoder_z_mc_pos, encoder_v_mc_pos = model_func( # obs_mc_input_positive.get(), num_actions, # scope="encoder_model_func", reuse=True) # z_mc_pos = tf.stop_gradient(encoder_z_mc_pos) # z_mc_pos = tf.reshape(keys_mc_input_positive, [-1, 1, latent_dim]) # z_mc_anchor = tf.reshape(z_mc, [-1, latent_dim, 1]) # z_mc_neg = tf.reshape(keys_mc_input_negative, [-1, K, latent_dim]) z_mc_pos = keys_mc_input_positive z_mc = tf.reshape(z_mc, [-1, latent_dim]) z_mc_expand = tf.reshape(z_mc, [-1, 1, latent_dim]) z_mc_tile = tf.tile(z_mc_expand, [1, K, 1]) z_mc_neg = keys_mc_input_negative z_mc_anchor = keys_mc_input_anchor anchor_dist = tf.sqrt( tf.reduce_sum(tf.square(z_mc - z_mc_anchor), axis=1)) pos_dist = tf.sqrt(tf.reduce_sum(tf.square(z_mc - z_mc_pos), axis=1)) neg_dist = tf.reduce_mean(tf.sqrt( tf.reduce_sum(tf.square(z_mc_tile - z_mc_neg), axis=2)), axis=1) # contrast_loss = tf.reduce_mean(tf.maximum(pos_dist - neg_dist + 1, 0)) contrast_loss = tf.reduce_mean(tf.maximum(pos_dist - neg_dist + 1, 0)) \ + 0.5 * tf.reduce_mean(pos_dist) + 0.5 * tf.reduce_mean(anchor_dist) pos_grad = tf.gradients([contrast_loss], [z_mc_pos]) neg_grad = tf.gradients([contrast_loss], [z_mc_neg]) # neg_grad = tf.gradients([contrast_loss],[z_mc_neg]) # negative = tf.matmul(z_mc_neg, z_mc_anchor) / tau # exp_negative = tf.squeeze(tf.reduce_sum(tf.exp(negative), axis=1)) # positive = tf.squeeze(tf.matmul(z_mc_pos, z_mc_anchor) / tau) # print("shape:", z_mc.shape, z_mc_anchor.shape, z_mc_pos.shape, negative.shape, exp_negative.shape, # positive.shape) # contrast_loss = tf.reduce_mean(tf.log(exp_negative) - positive) # print("shape2:", z_mc.shape, negative.shape, positive.shape) prediction_loss = tf.losses.mean_squared_error(value_input, v_mc) total_loss = contrast_loss if predict: total_loss += beta * prediction_loss model_func_vars = U.scope_vars(U.absolute_scope_name("model_func")) # encoder_model_func_vars = U.scope_vars(U.absolute_scope_name("encoder_model_func")) if grad_norm_clipping is not None: optimize_expr_contrast_with_prediction = U.minimize_and_clip( optimizer, total_loss, var_list=model_func_vars, clip_val=grad_norm_clipping) else: optimize_expr_contrast_with_prediction = optimizer.minimize( total_loss, var_list=model_func_vars) # Create callable functions # update_target_fn will be called periodically to copy Q network to target Q network # update_target_expr = [] # for var, var_target in zip(sorted(model_func_vars, key=lambda v: v.name), # sorted(encoder_model_func_vars, key=lambda v: v.name)): # update_target_expr.append(var_target.assign((1 - momentum) * var + momentum * var_target)) # update_target_expr = tf.group(*update_target_expr) # update_target = U.function([momentum], [], updates=[update_target_expr]) if use_rp: latten_obs = tf.reshape(obs_hash_input.get(), [-1, input_dim]) rp = tf.random.normal([input_dim, hash_dim], 0, 1 / np.sqrt(hash_dim)) obs_hash_output = tf.matmul(latten_obs, rp) else: obs_hash_output, _ = model_func(obs_hash_input.get(), num_actions, scope="hash_func", reuse=False) hash_func = U.function(inputs=[obs_hash_input], outputs=[obs_hash_output]) # EMDQN z_var_summary = tf.summary.scalar( "z_var", tf.reduce_mean(tf.math.reduce_std(z_mc, axis=1))) z_mean_summary = tf.summary.scalar( "z_mean", tf.reduce_mean(tf.math.reduce_mean(z_mc, axis=1))) negative_summary = tf.summary.scalar( "negative", tf.reduce_mean(tf.reduce_mean(neg_dist))) negative_mean_summary = tf.summary.scalar( "negative mean", tf.reduce_mean(tf.reduce_mean(z_mc_neg))) negative_grad_summary = 
tf.summary.scalar( "negative grad", tf.reduce_mean(tf.abs(neg_grad))) negative_var_summary = tf.summary.scalar( "negative std", tf.reduce_mean(tf.math.reduce_std(z_mc_neg, axis=2))) # negative_summary = tf.summary.scalar("negative", tf.reduce_mean(tf.reduce_mean(negative))) positive_summary = tf.summary.scalar( "positive", tf.reduce_mean(tf.reduce_mean(pos_dist))) positive_mean_summary = tf.summary.scalar( "positive mean", tf.reduce_mean(tf.reduce_mean(z_mc_pos))) positive_grad_summary = tf.summary.scalar( "positive grad", tf.reduce_mean(tf.abs(pos_grad))) positive_std_summary = tf.summary.scalar( "positive std", tf.reduce_mean(tf.math.reduce_std(z_mc_pos, axis=1))) anchor_summary = tf.summary.scalar( "anchor", tf.reduce_mean(tf.reduce_mean(anchor_dist))) # positive_summary = tf.summary.scalar("positive", tf.reduce_mean(tf.reduce_mean(positive))) # z_norm_summary = tf.summary.scalar("z_norm", tf.reduce_mean(tf.norm(z_mc, axis=1))) # encoder_z_norm_summary = tf.summary.scalar("encoder_z_norm", tf.reduce_mean(tf.norm(encoder_z_mc_pos, axis=1))) # neg_norm_summary = tf.summary.scalar("neg_z_norm", tf.reduce_mean(tf.norm(keys_mc_input_negative, axis=[1, 2]))) contrast_loss_summary = tf.summary.scalar( "contrast loss", tf.reduce_mean(contrast_loss)) prediction_loss_summary = tf.summary.scalar( "prediction loss", tf.reduce_mean(prediction_loss)) total_loss_summary = tf.summary.scalar("total loss", tf.reduce_mean(total_loss)) if predict: summaries = [ z_var_summary, z_mean_summary, positive_summary, negative_summary, contrast_loss_summary, prediction_loss_summary, total_loss_summary ] else: summaries = [ z_var_summary, z_mean_summary, negative_var_summary, negative_grad_summary, negative_mean_summary, positive_summary, positive_mean_summary, positive_grad_summary, positive_std_summary, negative_summary, contrast_loss_summary, anchor_summary, total_loss_summary ] summary = tf.summary.merge(summaries) train = U.function( inputs=inputs, outputs=[total_loss, summary, z_mc, pos_grad, neg_grad], updates=[optimize_expr_contrast_with_prediction]) return hash_func, z_func, train
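# Hedged usage sketch (illustrative): with predict=True, train() takes the temperature,
# the query observations, positive/negative/anchor keys drawn from the episodic memory,
# plus raw observations and Monte-Carlo returns for the value head. `batch` is an
# assumed container.
def _example_mfmc_train_step(train, batch, tau=0.1):
    import numpy as np
    total_loss, summary, z, pos_grad, neg_grad = train(
        np.array([tau], dtype=np.float32),
        batch.obs_query,        # [B, ...]
        batch.keys_positive,    # [B, latent_dim]
        batch.keys_negative,    # [B, K, latent_dim]
        batch.keys_anchor,      # [B, latent_dim]
        batch.obs,              # [B, ...]
        batch.returns)          # [B, 1] Monte-Carlo returns
    return total_loss, z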
def build_train_dbc(input_type, obs_shape, repr_func, model_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, scope="mfec", num_neg=10, latent_dim=32, alpha=1, beta=1e2, theta=10, loss_type=["contrast"], knn=4, c_loss_type="margin", b=100, batch_size=32, reuse=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. """ if c_loss_type != "infonce": assert num_neg == 1 # z_func = build_act_contrast(make_obs_ph, model_func, num_actions, scope=scope, secondary_scope="model_func", # reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders # EMDQN # tau = tf.placeholder(tf.float32, [1], name='tau') # momentum = tf.placeholder(tf.float32, [1], name='momentum') # make_obs_ph = lambda name: input_type(obs_shape, batch_size, name=name), magic_num = tf.get_variable(name='magic', shape=[1]) obs_input_u = U.ensure_tf_input( input_type(obs_shape, None, name="obs_u")) obs_input_u_tp1 = U.ensure_tf_input( input_type(obs_shape, None, name="obs_u_tp1")) obs_input_v = U.ensure_tf_input( input_type(obs_shape, None, name="obs_v")) action_input = tf.placeholder(tf.int32, [batch_size], name="action") reward_input = tf.placeholder(tf.float32, [batch_size], name="action") inputs = [ obs_input_u, obs_input_u_tp1, obs_input_v, action_input, reward_input ] z_old = repr_func(obs_input_u.get(), num_actions, scope="target_repr_func", reuse=False) z_u = repr_func(obs_input_u.get(), num_actions, scope="repr_func", reuse=tf.AUTO_REUSE) z_u_tp1 = repr_func(obs_input_u_tp1.get(), num_actions, scope="repr_func", reuse=tf.AUTO_REUSE) z_v = repr_func(obs_input_v.get(), num_actions, scope="repr_func", reuse=tf.AUTO_REUSE) z_u_tp1_predict, r_u_predict = model_func(z_u, num_actions, scope="model_func", reuse=tf.AUTO_REUSE) z_v_tp1_predict, r_v_predict = model_func(z_v, num_actions, scope="model_func", reuse=tf.AUTO_REUSE) # total_loss = 0 # reprsentation loss dist_bisimulation = tf.reduce_max( tf.abs(r_u_predict - r_v_predict) + gamma * tf.reduce_sum( tf.square(z_u_tp1_predict - z_v_tp1_predict), axis=2), axis=1) dist_bisimulation = tf.stop_gradient(dist_bisimulation) repr_loss = tf.losses.mean_squared_error( tf.norm(z_u - z_v, ord=1, axis=1), dist_bisimulation) # model loss z_u_tp1_selected = tf.gather(z_u_tp1_predict, action_input, axis=1, batch_dims=0) 
r_u_selected = tf.gather(r_u_predict, action_input, axis=1, batch_dims=0) transition_loss = tf.losses.mean_squared_error( z_u_tp1, tf.stop_gradient(z_u_tp1_selected)) reward_loss = tf.losses.mean_squared_error( reward_input, tf.stop_gradient(r_u_selected)) model_loss = transition_loss + reward_loss total_loss = repr_loss + alpha * model_loss model_func_vars = U.scope_vars(U.absolute_scope_name("repr_func")) model_func_vars_update = copy.copy(model_func_vars) + U.scope_vars( U.absolute_scope_name("model_func")) target_model_func_vars = U.scope_vars( U.absolute_scope_name("target_repr_func")) update_target_expr = [] for var in model_func_vars: print(var.name, var.shape) for var_target in target_model_func_vars: print(var_target.name, var_target.shape) for var, var_target in zip( sorted(model_func_vars, key=lambda v: v.name), sorted(target_model_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) if grad_norm_clipping is not None: optimize_expr = U.minimize_and_clip( optimizer, total_loss, var_list=model_func_vars_update, clip_val=grad_norm_clipping) else: optimize_expr = optimizer.minimize(total_loss, var_list=model_func_vars_update) # Create callable functions # update_target_fn will be called periodically to copy Q network to target Q network z_var_summary = tf.summary.scalar( "z_var", tf.reduce_mean(tf.math.reduce_std(z_u, axis=1))) total_loss_summary = tf.summary.scalar("total loss", tf.reduce_mean(total_loss)) transition_loss_summary = tf.summary.scalar( "transition loss", tf.reduce_mean(transition_loss)) reward_loss_summary = tf.summary.scalar("reward loss", tf.reduce_mean(reward_loss)) model_loss_summary = tf.summary.scalar("model loss", tf.reduce_mean(model_loss)) repr_loss_summary = tf.summary.scalar("repr loss", tf.reduce_mean(repr_loss)) summaries = [ z_var_summary, total_loss_summary, transition_loss_summary, reward_loss_summary, model_loss_summary, repr_loss_summary ] summary = tf.summary.merge(summaries) outputs = [total_loss, summary] train = U.function(inputs=inputs, outputs=outputs, updates=[optimize_expr]) eval = U.function(inputs=inputs, outputs=outputs, updates=[]) z_func = U.function( inputs=[obs_input_u], outputs=[z_old], ) update_target_func = U.function([], [], updates=[update_target_expr]) return z_func, train, eval, update_target_func
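# For reference, a NumPy sketch of the bisimulation (DBC) objective built above: the
# target distance pairs states u and v, takes the max over actions of the reward gap
# plus the discounted squared distance between their predicted next latents, and the
# representation loss regresses the L1 distance between the current embeddings onto that
# (gradient-stopped) target. The helper names below are illustrative only.
import numpy as np

def bisimulation_targets(r_u, r_v, z_u_next, z_v_next, gamma=0.99):
    """r_*: (batch, num_actions); z_*_next: (batch, num_actions, latent_dim)."""
    reward_gap = np.abs(r_u - r_v)                         # (batch, num_actions)
    next_gap = np.sum((z_u_next - z_v_next) ** 2, axis=2)  # (batch, num_actions)
    return np.max(reward_gap + gamma * next_gap, axis=1)   # (batch,)

def representation_loss(z_u, z_v, targets):
    """MSE between the on-policy L1 embedding distance and the detached target."""
    l1 = np.sum(np.abs(z_u - z_v), axis=1)
    return np.mean((l1 - targets) ** 2)

rng = np.random.RandomState(0)
B, A, D = 4, 3, 8
t = bisimulation_targets(rng.rand(B, A), rng.rand(B, A),
                         rng.randn(B, A, D), rng.randn(B, A, D))
print(representation_loss(rng.randn(B, D), rng.randn(B, D), t))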
def build_train(make_obs_ph, q_func, num_actions, optimizer, chief=False, server=None, workers=1, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None): """Creates the act function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that take a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. chief: bool whether or not the worker should assume chief duties. these include: initializing global parameters, tensorboarding, saving, etc. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. """ task = server.server_def.task_index act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, task=task) with tf.variable_scope(scope, reuse=reuse): with tf.device("/job:worker/task:{}".format(task)): # set up placeholders obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t")) act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1")) done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") # Local timestep counters t = tf.placeholder(tf.float32, [1], name="t") t_global_old = tf.placeholder(tf.float32, [1], name="t_global_old") score_input = tf.placeholder(tf.float32, [1], name="score_input") grad_prio = tf.placeholder(tf.bool, [1], name="grad_prio") converged_ph = tf.placeholder(tf.bool, [1], name="converged") factor_input = tf.placeholder(tf.float32, [1], name="factor_input") # Global timestep counter # TODO Does TF have built-in global step counters? 
with tf.device("/job:ps/task:0"): t_global = tf.Variable(dtype=tf.float32, initial_value=[0], name="t_global") run_code_global = tf.Variable(initial_value="", name="run_code_global") comm_rounds_global = tf.Variable(dtype=tf.float32, initial_value=[0], name="comm_rounds_global") max_workers_global = tf.constant(workers, dtype=tf.float32, name="max_workers_global") worker_count_global = tf.Variable(dtype=tf.float32, initial_value=[0], name="worker_count_global") score_max_global = tf.Variable(dtype=tf.float32, initial_value=[0], name="score_max_global") score_min_global = tf.Variable(dtype=tf.float32, initial_value=[0], name="score_min_global") submit_count_global = tf.Variable(dtype=tf.float32, initial_value=[-1], name="submit_count_global") converged_global = tf.Variable(dtype=tf.bool, initial_value=[False], name="converged_global") # q network evaluation q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) # target q network evalution q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") target_q_func_vars = U.scope_vars( U.absolute_scope_name("target_q_func")) # global weights print("chief:", chief, "reuse:", True if not chief else None) global_q_func_vars = [] # with tf.device(tf.train.replica_device_setter(cluster=cluster)): with tf.device( "/job:ps/task:0"): # TODO needs RDS if using multiple PS # q_global = q_func(obs_t_input.get(), num_actions, scope="global_weights", reuse=None if chief else True)#reuse=(not chief)) # q_global = q_func(obs_t_input.get(), num_actions, scope="global_weights") with tf.variable_scope("global_weights"): for var in q_func_vars: name = var.name.split(":")[0].split("q_func/")[-1] global_q_func_vars.append( tf.get_variable(name=name, shape=var.shape, dtype=var.dtype, initializer=tf.contrib.layers. xavier_initializer( seed=1, dtype=var.dtype))) # global_q_func_vars = U.scope_vars(U.absolute_scope_name("global_weights")) # print("Global:", global_q_func_vars) # old weights (used to implicitly calculate gradient sum: q_func_vars - q_func_vars_old) q_func_vars_old = [] with tf.variable_scope("old_weights"): for var in q_func_vars: name = var.name.split(":")[0].split("q_func/")[-1] q_func_vars_old.append( tf.get_variable( name=name, shape=var.shape, dtype=var.dtype, initializer=tf.contrib.layers.xavier_initializer( seed=1, dtype=var.dtype))) # q_old = q_func(obs_t_input.get(), num_actions, scope="old_weights") # q_func_vars_old = U.scope_vars(U.absolute_scope_name("old_weights")) # print("Old vars:", q_func_vars_old) # q scores for actions which we know were selected in the given state. 
q_t_selected = tf.reduce_sum( q_t * tf.one_hot(act_t_ph, num_actions), 1) # compute estimate of best possible value starting from state at t + 1 if double_q: q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) q_tp1_best_using_online_net = tf.arg_max( q_tp1_using_online_net, 1) q_tp1_best = tf.reduce_sum( q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) else: q_tp1_best = tf.reduce_max(q_tp1, 1) q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best # compute RHS of bellman equation q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) errors = U.huber_loss(td_error) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: optimize_expr = U.minimize_and_clip( optimizer, weighted_error, var_list=q_func_vars, clip_val=grad_norm_clipping) else: optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # update_global_fn will be called periodically to copy global Q network to q network update_global_expr = [] for var_global, var, var_old in zip( sorted(global_q_func_vars, key=lambda v: v.name), sorted(q_func_vars, key=lambda v: v.name), sorted(q_func_vars_old, key=lambda v: v.name)): update_global_expr.append(var.assign(var_global)) # TODO Can async cause var <- var_global, var_global <- new value, var_old <- var_global in that order? # TODO Should this copy from var instead? (concurrency issues?) # TODO Can concurrency cause var_old <- var, var <- var_global in that order (resulting in wrong values)? # TODO Safest method is to force sequential execution of var <- var_global, var_old <- var! How though? 
update_global_expr.append(var_old.assign(var_global)) update_global_expr = tf.group(*update_global_expr) # update the global time step counter by adding the local update_t_global = t_global.assign_add(t) optimize_global_expr = [] # Factor to multiply every gradient with # f = t / (t_global - t_global_old) dt = tf.subtract(update_t_global, t_global_old) factor = tf.where( tf.greater_equal(factor_input, 0), factor_input, tf.where( grad_prio, tf.divide(tf.subtract(score_input, score_min_global), tf.subtract(score_max_global, score_min_global)), tf.div(t, dt))) for var, var_old, var_global in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(q_func_vars_old, key=lambda v: v.name), sorted(global_q_func_vars, key=lambda v: v.name)): # Multiply the difference between the old parameters and the locally optimized parameters # g = (var - var_old) * f grad = tf.multiply(tf.subtract(var, var_old), factor) optimize_global_expr.append(var_global.assign_add(grad)) optimize_global_expr = tf.group(*optimize_global_expr) # if cr == cr_g and wc < wc_max: # wc += 1 # score_global += score # if cr == cr_g and wc == wc_max: # vc += 1 # score_global += score # cr_g += 0.5 # return cr_g """ if cr == cr_g: if wc <= wc_max: wc += 1 score_global += score if wc == wc_max: cr_g += 0.5 return cr_g """ # submit_score_expr = \ # tf.cond(tf.equal(comm_rounds, comm_rounds_global), # lambda: tf.cond(tf.less_equal(worker_count_global, max_workers_global), # lambda: tf.group(worker_count_global.assign_add([1]), # score_global.assign_add(score_input), # tf.cond(tf.equal(worker_count_global, max_workers_global), # lambda: comm_rounds_global.assign_add([0.5]), # lambda: None)), # lambda: tf.group(None, None, None)), # lambda: None) # submit_score_expr = \ # tf.cond(tf.logical_and(tf.equal(comm_rounds, comm_rounds_global), # tf.less(worker_count_global, max_workers_global)), # tf.group(worker_count_global.assign_add(1), # score_global.assign_add(score_input)), # tf.cond(tf.logical_and(tf.equal(comm_rounds, comm_rounds_global), # tf.equal(worker_count_global, max_workers_global)), # tf.group(worker_count_global.assign_add(1), # score_global.assign_add(score_input), # comm_rounds_global.assign_add(0.5)))) # This makes a sum of all scores ( # submit_score_expr = score_global.assign_add(score_input) # This only saves the maximum score (for normalized score weighting) submit_score_max = score_max_global.assign(tf.maximum( score_input, score_max_global), use_locking=True) submit_score_min = score_min_global.assign(tf.minimum( score_input, score_min_global), use_locking=True) set_submit_count = submit_count_global.assign(score_input, use_locking=True) inc_submit_count = submit_count_global.assign_add([1], use_locking=True) # check_round_op = tf.equal(comm_rounds, comm_rounds_global) # Not used anymore inc_wc = worker_count_global.assign_add([1], use_locking=True) zero_wc = worker_count_global.assign([0], use_locking=True) inc_cr = comm_rounds_global.assign_add([1], use_locking=True) score_reset = score_max_global.assign([0], use_locking=True) converged_set = converged_global.assign(converged_ph, use_locking=True) # Create callable functions train = U.function(inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph ], outputs=[td_error], updates=[optimize_expr]) global_opt = U.function( inputs=[t, t_global_old, score_input, factor_input, grad_prio], outputs=[dt, comm_rounds_global, factor], updates=[optimize_global_expr]) # global_sync_opt = U.function(inputs=[comm_rounds], 
# outputs=[comm_rounds_global], updates=[optimize_global_sync_expr]) update_weights = U.function(inputs=[], outputs=[t_global], updates=[update_global_expr]) update_target = U.function([], [], updates=[update_target_expr]) submit_score = U.function( inputs=[score_input], outputs=[comm_rounds_global], updates=[submit_score_max, submit_score_min]) check_round = U.function(inputs=[], outputs=[comm_rounds_global], updates=[]) request_submit = U.function(inputs=[], outputs=[comm_rounds_global, inc_wc], updates=[]) set_submit = U.function(inputs=[score_input], outputs=[set_submit_count], updates=[]) check_submit = U.function(inputs=[], outputs=[submit_count_global], updates=[]) inc_submit = U.function(inputs=[], outputs=[inc_submit_count], updates=[]) inc_comm_round = U.function(inputs=[], outputs=[inc_cr], updates=[]) reset_wc = U.function(inputs=[], outputs=[zero_wc], updates=[]) check_wc = U.function(inputs=[], outputs=[worker_count_global], updates=[]) reset_score = U.function(inputs=[], outputs=[], updates=[score_reset]) set_converged = U.function(inputs=[converged_ph], outputs=[], updates=[converged_set]) check_converged = U.function(inputs=[], outputs=[converged_global], updates=[]) # Debugging functions q_values = U.function([obs_t_input], q_t) weights = U.function( inputs=[], outputs=[q_func_vars, global_q_func_vars, q_func_vars_old], updates=[]) t_global_func = U.function([], t_global) comm_rounds_func = U.function([], comm_rounds_global) return act_f, train, global_opt, update_target, update_weights, \ {'request_submit': request_submit, 'submit_score': submit_score, 'check_round': check_round, 'check_submit': check_submit, 'set_submit': set_submit, 'inc_submit': inc_submit, 'inc_comm_round': inc_comm_round, 'reset_wc': reset_wc, 'check_wc': check_wc, 'reset_score': reset_score, 'set_converged': set_converged, 'check_converged': check_converged}, \ {'q_values': q_values, 'weights': weights, 't_global': t_global_func, 'run_code': run_code_global, 'comm_rounds': comm_rounds_func, 'factor': factor}
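# The global optimization step above communicates an implicit gradient sum: each worker
# remembers the weights it last pulled (the "old" copy), optimizes locally, and then adds
# (var - var_old) * factor onto the shared parameter-server copy, where factor is either
# given directly, derived from a normalized episode score, or t / (t_global - t_global_old).
# A NumPy sketch of that accumulation, with illustrative names and no TF/PS machinery:
import numpy as np

def pull_global(global_w):
    """Worker refresh: copy the global weights and keep a snapshot as the 'old' copy."""
    local_w = {k: v.copy() for k, v in global_w.items()}
    old_w = {k: v.copy() for k, v in global_w.items()}
    return local_w, old_w

def push_update(global_w, local_w, old_w, factor):
    """Add the scaled local parameter delta onto the shared global weights."""
    for k in global_w:
        global_w[k] += factor * (local_w[k] - old_w[k])

global_w = {"w": np.zeros(3)}
local_w, old_w = pull_global(global_w)
local_w["w"] += np.array([0.3, -0.1, 0.2])  # pretend local optimization happened
t, dt = 100.0, 400.0                        # local steps vs. global steps since the pull
push_update(global_w, local_w, old_w, factor=t / dt)
print(global_w["w"])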
def build_train(make_obs_ph, q_func, num_actions, optimizer, bootstrap=False, swarm=False, voting=False, heads=1, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None, device="/cpu:0"): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. """ act_f = build_act(make_obs_ph, q_func, bootstrap=bootstrap, swarm=swarm, voting=voting, heads=heads, num_actions=num_actions, scope=scope, reuse=reuse, device=device) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t")) act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1")) done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") update_lr_ph = tf.placeholder(tf.float32, (), name="learning_rate") lr = tf.get_variable("lr", (), initializer=tf.constant_initializer(0)) with tf.device(device): # q network evaluation q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True, heads=heads) # reuse parameters from act q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) # target q network evalution q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func", reuse=True, heads=heads) # reuse parameters form act target_q_func_vars = U.scope_vars( U.absolute_scope_name("target_q_func")) # q scores for actions which we know were selected in the given state. 
q_t_selected = [] for i in range(heads): q_t_selected.append( tf.reduce_sum(q_t[i] * tf.one_hot(act_t_ph, num_actions), 1)) # compute estimate of best possible value starting from state at t + 1 q_tp1_best = [] q_tp1_best_using_online_net = [] if swarm: q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True, heads=heads) action_subsets = [] for i in range(heads): target_greedy_action = tf.argmax(q_tp1[i], axis=1) online_q_value_threshold = tf.reduce_sum( q_tp1_using_online_net[i] * tf.one_hot(target_greedy_action, num_actions), 1) online_q_value_threshold = tf.tile( tf.expand_dims(online_q_value_threshold, 1), tf.constant([1, num_actions])) action_subset = tf.where( (q_tp1_using_online_net[i] - online_q_value_threshold) >= 0, tf.ones([tf.shape(obs_t_input.get())[0], num_actions]), tf.zeros([tf.shape(obs_t_input.get())[0], num_actions])) action_subsets.append(action_subset) action_subsets = tf.stack(action_subsets, axis=1) actions_cover = set_cover(action_subsets) # preferred_actions = tf.transpose(action_subsets, [1, 0, 2]) for i in range(heads): q_tp1_best_using_online_net.append( tf.argmax(tf.multiply(actions_cover, q_tp1[i]), axis=1)) q_tp1_best.append( tf.reduce_sum( q_tp1[i] * tf.one_hot( q_tp1_best_using_online_net[i], num_actions), 1)) elif double_q: q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True, heads=heads) for i in range(heads): q_tp1_best_using_online_net.append( tf.arg_max(q_tp1_using_online_net[i], 1)) q_tp1_best.append( tf.reduce_sum( q_tp1[i] * tf.one_hot( q_tp1_best_using_online_net[i], num_actions), 1)) else: for i in range(heads): q_tp1_best.append(tf.reduce_max(q_tp1, 1)) q_tp1_best_masked = [] q_t_selected_target = [] td_error = [] errors = [] weighted_error = [] optimize_expr = [] optimizer = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.9, beta2=0.99, epsilon=1e-4) q_func_heads = U.scope_vars(U.absolute_scope_name("q_func/heads")) q_func_convnets = U.scope_vars(U.absolute_scope_name("q_func/convnet")) for i in range(heads): q_tp1_best_masked.append((1.0 - done_mask_ph) * q_tp1_best[i]) # compute RHS of bellman equation q_t_selected_target.append(rew_t_ph + gamma * q_tp1_best_masked[i]) # compute the error (potentially clipped) td_error.append(q_t_selected[i] - tf.stop_gradient(q_t_selected_target[i])) with tf.device(device): errors.append(U.huber_loss(td_error[i])) weighted_error.append( tf.reduce_mean(importance_weights_ph * errors[i])) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: optimize_expr.append( U.minimize_and_clip(optimizer, weighted_error[i], var_list=q_func_heads, clip_val=grad_norm_clipping)) optimize_expr.append( U.minimize_and_clip(optimizer, 0.1 * weighted_error[i], var_list=q_func_convnets, clip_val=grad_norm_clipping)) else: optimize_expr.append( optimizer.minimize(weighted_error[i], var_list=q_func_vars)) update_lr_expr = lr.assign( tf.cond(update_lr_ph >= 0, lambda: update_lr_ph, lambda: lr)) optimize_expr.append(update_lr_expr) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # Create callable functions train = U.function( inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph, update_lr_ph ], 
outputs=td_error[0], updates=optimize_expr, ) update_target = U.function([], [], updates=[update_target_expr]) q_values = U.function([obs_t_input], q_t) return act_f, train, update_target, {'q_values': q_values}
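# Each head above gets its own TD target and Huber loss on top of a shared convnet.
# A NumPy sketch of the per-head target construction (illustrative names; the swarm
# set-cover action selection and importance weighting are omitted):
import numpy as np

def per_head_td_errors(q_t, q_tp1, actions, rewards, dones, gamma=0.99):
    """q_t, q_tp1: (heads, batch, num_actions); returns (heads, batch) TD errors."""
    heads, batch, _ = q_t.shape
    td = np.empty((heads, batch))
    for i in range(heads):
        q_sel = q_t[i, np.arange(batch), actions]        # Q_i(s, a) for the taken action
        best = q_tp1[i].max(axis=1)                      # max_a' target Q_i(s', a')
        target = rewards + gamma * (1.0 - dones) * best  # masked Bellman target
        td[i] = q_sel - target
    return td

rng = np.random.RandomState(0)
H, B, A = 4, 5, 6
errors = per_head_td_errors(rng.randn(H, B, A), rng.randn(H, B, A),
                            rng.randint(A, size=B), rng.rand(B), np.zeros(B))
print(errors.shape)  # (4, 5)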
def build_train_mf(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, scope="mfec", alpha=1.0, beta=1.0, theta=1.0, latent_dim=32, ib=True, reuse=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. 
""" act_noise = tf.placeholder(tf.float32, [None, latent_dim], name="act_noise") act_f = build_act_mf(make_obs_ph, q_func, act_noise, num_actions, scope=scope, reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders # EMDQN obs_vae_input = U.ensure_tf_input(make_obs_ph("obs_vae")) z_noise_vae = tf.placeholder(tf.float32, [None, latent_dim], name="z_noise_vae") inputs = [obs_vae_input, z_noise_vae] if ib: qec_input = tf.placeholder(tf.float32, [None], name='qec') inputs.append(qec_input) outputs = [] q_vae, q_deterministic_vae, v_mean_vae, v_logvar_vae, z_mean_vae, z_logvar_vae, recon_obs = q_func( obs_vae_input.get(), z_noise_vae, num_actions, scope="q_func", reuse=True) q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) encoder_loss = -1 + z_mean_vae**2 + tf.exp(z_logvar_vae) - z_logvar_vae total_loss = tf.reduce_mean(beta * encoder_loss) decoder_loss = tf.keras.losses.binary_crossentropy( tf.reshape(recon_obs, [-1]), tf.reshape(tf.dtypes.cast(obs_vae_input._placeholder, tf.float32), [-1])) print("here", z_mean_vae.shape, z_logvar_vae.shape, encoder_loss.shape, decoder_loss.shape) vae_loss = beta * encoder_loss + theta * decoder_loss outputs.append(encoder_loss) outputs.append(decoder_loss) outputs.append(vae_loss) total_loss += tf.reduce_mean(theta * decoder_loss) if ib: ib_loss = (v_mean_vae - tf.stop_gradient(tf.expand_dims(qec_input, 1)) )**2 / tf.exp(v_logvar_vae) + v_logvar_vae print("here2", v_mean_vae.shape, tf.expand_dims(qec_input, 1).shape, v_logvar_vae.shape, ib_loss.shape) total_ib_loss = alpha * ib_loss + beta * encoder_loss outputs.append(total_ib_loss) total_loss += tf.reduce_mean(alpha * ib_loss) if grad_norm_clipping is not None: optimize_expr = U.minimize_and_clip(optimizer, total_loss, var_list=q_func_vars, clip_val=grad_norm_clipping) else: optimize_expr = optimizer.minimize(total_loss, var_list=q_func_vars) # Create callable functions # EMDQN total_loss_summary = tf.summary.scalar("total loss", total_loss) z_var_summary = tf.summary.scalar("z_var", tf.reduce_mean(tf.exp(z_logvar_vae))) encoder_loss_summary = tf.summary.scalar("encoder loss", tf.reduce_mean(encoder_loss)) decoder_loss_summary = tf.summary.scalar("decoder loss", tf.reduce_mean(decoder_loss)) summaries = [ total_loss_summary, z_var_summary, encoder_loss_summary, decoder_loss_summary ] if ib: ib_loss_summary = tf.summary.scalar("ib loss", tf.reduce_mean(ib_loss)) total_ib_loss_summary = tf.summary.scalar( "total ib loss", tf.reduce_mean(total_ib_loss)) summaries.append(ib_loss_summary) summaries.append(total_ib_loss_summary) summary = tf.summary.merge(summaries) outputs.append(summary) train = U.function(inputs=inputs, outputs=[total_loss, summary], updates=[optimize_expr]) return act_f, train
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) param_noise_filter_func: tf.Variable -> bool function that decides whether or not a variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter is used by default. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. """ if param_noise: act_f = build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, param_noise_filter_func=param_noise_filter_func) else: act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t")) act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1")) done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") # q network evaluation q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) # target q network evalution q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func")) # q scores for actions which we know were selected in the given state. 
q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) # compute estimate of best possible value starting from state at t + 1 if double_q: q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) q_tp1_best_using_online_net = tf.arg_max(q_tp1_using_online_net, 1) q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) else: q_tp1_best = tf.reduce_max(q_tp1, 1) q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best # compute RHS of bellman equation q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) errors = U.huber_loss(td_error) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: optimize_expr = U.minimize_and_clip(optimizer, weighted_error, var_list=q_func_vars, clip_val=grad_norm_clipping) else: optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # Create callable functions train = U.function( inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph ], outputs=td_error, updates=[optimize_expr] ) update_target = U.function([], [], updates=[update_target_expr]) q_values = U.function([obs_t_input], q_t) return act_f, train, update_target, {'q_values': q_values}
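# The double_q branch above implements Double Q-Learning: the online network picks the
# argmax action for s', and the target network evaluates it. A NumPy sketch of that
# target (illustrative names):
import numpy as np

def double_q_targets(q_tp1_online, q_tp1_target, rewards, dones, gamma=0.99):
    """Evaluate the online network's greedy next action under the target network."""
    batch = rewards.shape[0]
    a_star = np.argmax(q_tp1_online, axis=1)          # action chosen by the online net
    q_best = q_tp1_target[np.arange(batch), a_star]   # value assigned by the target net
    return rewards + gamma * (1.0 - dones) * q_best

rng = np.random.RandomState(0)
B, A = 5, 4
print(double_q_targets(rng.randn(B, A), rng.randn(B, A), rng.rand(B), np.zeros(B)))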
def build_train(make_obs_ph, p_dist_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="distdeepq", reuse=None, param_noise=False, param_noise_filter_func=None, dist_params=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name p_dist_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) param_noise_filter_func: tf.Variable -> bool function that decides whether or not a variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter is used by default. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. 
""" if param_noise: raise ValueError('parameter noise not supported') else: act_f = build_act(make_obs_ph, p_dist_func, num_actions, dist_params, scope=scope, reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t")) act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1")) done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") # ===================================================================================== # q network evaluation p_t = p_dist_func(obs_t_input.get(), num_actions, dist_params['nb_atoms'], scope="q_func", reuse=True) # reuse parameters from act q_t = p_to_q(p_t, dist_params) # reuse parameters from act q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) # target q network evalution p_tp1 = p_dist_func(obs_tp1_input.get(), num_actions, dist_params['nb_atoms'], scope="target_q_func") q_tp1 = p_to_q(p_tp1, dist_params) target_q_func_vars = U.scope_vars( U.absolute_scope_name("target_q_func")) # TODO: use double a_next = tf.argmax(q_tp1, 1, output_type=tf.int32) batch_dim = tf.shape(rew_t_ph)[0] ThTz, debug = build_categorical_alg(p_tp1, rew_t_ph, a_next, gamma, batch_dim, done_mask_ph, dist_params) # compute the error (potentially clipped) cat_idx = tf.transpose( tf.reshape(tf.concat([tf.range(batch_dim), act_t_ph], axis=0), [2, batch_dim])) p_t_next = tf.gather_nd(p_t, cat_idx) cross_entropy = -1 * ThTz * tf.log(p_t_next) errors = tf.reduce_sum(cross_entropy, axis=-1) mean_error = tf.reduce_mean(errors) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: optimize_expr = U.minimize_and_clip(optimizer, mean_error, var_list=q_func_vars, clip_val=grad_norm_clipping) else: optimize_expr = optimizer.minimize(mean_error, var_list=q_func_vars) # ===================================================================================== # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # Create callable functions train = U.function(inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph ], outputs=errors, updates=[optimize_expr]) update_target = U.function([], [], updates=[update_target_expr]) q_values = U.function([obs_t_input], q_t) return act_f, train, update_target, { **debug, 'q_values': q_values, 'p': p_tp1, 'cross_entropy': cross_entropy, 'ThTz': ThTz }
def build_train_contrast(make_obs_ph, model_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, scope="mfec", latent_dim=32, alpha=0.05, beta=0.1, theta=0.1, loss_type=["contrast"], c_loss_type="sqmargin", reuse=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. """ # z_func = build_act_contrast(make_obs_ph, model_func, num_actions, scope=scope, secondary_scope="model_func", # reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders # EMDQN # tau = tf.placeholder(tf.float32, [1], name='tau') # momentum = tf.placeholder(tf.float32, [1], name='momentum') obs_input_query = U.ensure_tf_input(make_obs_ph("obs_query")) obs_input_positive = U.ensure_tf_input(make_obs_ph("enc_obs_pos")) obs_input_negative = U.ensure_tf_input(make_obs_ph("enc_obs_neg")) value_input_query = tf.placeholder(tf.float32, [None], name="value") action_embedding = tf.Variable(tf.random_normal( [num_actions, latent_dim], stddev=1), name="action_embedding") action_input = tf.placeholder(tf.int32, [None], name="action") inputs = [obs_input_query] if "contrast" in loss_type: inputs += [obs_input_positive, obs_input_negative] if "regression" in loss_type: inputs += [value_input_query] if "linear_model" in loss_type: inputs += [action_input] if "contrast" not in loss_type: inputs += [obs_input_positive] z = model_func(obs_input_query.get(), num_actions, scope="model_func", reuse=tf.AUTO_REUSE) h = model_func(obs_input_query.get(), num_actions, scope="hash_func", reuse=False) # _, v = model_func( # obs_input_query.get(), num_actions, # scope="model_func", # reuse=True) z_pos = model_func(obs_input_positive.get(), num_actions, scope="model_func", reuse=True) z_neg = model_func(obs_input_negative.get(), num_actions, scope="model_func", reuse=True) z_pos = tf.reshape(z_pos, [-1, latent_dim]) z_tar = tf.reshape(z, [-1, latent_dim]) z_neg = tf.reshape(z_neg, [-1, latent_dim]) contrast_loss = contrastive_loss_fc(z_tar, z_pos, z_neg, c_type=c_loss_type) regression_loss = tf.reduce_mean( tf.squared_difference(tf.norm(z_tar, axis=1), alpha * value_input_query)) action_embeded = tf.matmul(tf.one_hot(action_input, num_actions), action_embedding) model_loss = tf.reduce_mean( tf.squared_difference(action_embeded + z_tar, z_pos)) print("shape:", z_tar.shape, 
z_pos.shape, z_neg.shape, action_embeded.shape) # contrast_loss = tf.reduce_mean(tf.log(sum_negative) - positive) # print("shape2:", z.shape, negative.shape, positive.shape) # prediction_loss = tf.losses.mean_squared_error(value_input, v) total_loss = 0 if "contrast" in loss_type: total_loss += contrast_loss if "regression" in loss_type: total_loss += beta * regression_loss elif "linear_model" in loss_type: total_loss += theta * model_loss model_func_vars = U.scope_vars(U.absolute_scope_name("model_func")) if "linear_model" in loss_type: model_func_vars.append(action_embedding) if grad_norm_clipping is not None: optimize_expr = U.minimize_and_clip(optimizer, total_loss, var_list=model_func_vars, clip_val=grad_norm_clipping) else: optimize_expr = optimizer.minimize(total_loss, var_list=model_func_vars) # Create callable functions # update_target_fn will be called periodically to copy Q network to target Q network z_var_summary = tf.summary.scalar( "z_var", tf.reduce_mean(tf.math.reduce_std(z, axis=1))) negative_summary = tf.summary.scalar( "negative_dist", tf.reduce_mean(emb_dist(z_tar, z_neg))) positive_summary = tf.summary.scalar( "positive_dist", tf.reduce_mean(emb_dist(z_tar, z_pos))) contrast_loss_summary = tf.summary.scalar( "contrast loss", tf.reduce_mean(contrast_loss)) regression_loss_summary = tf.summary.scalar( "regression loss", tf.reduce_mean(regression_loss)) model_loss_summary = tf.summary.scalar("model loss", tf.reduce_mean(model_loss)) # prediction_loss_summary = tf.summary.scalar("prediction loss", tf.reduce_mean(prediction_loss)) total_loss_summary = tf.summary.scalar("total loss", tf.reduce_mean(total_loss)) summaries = [z_var_summary, total_loss_summary] if "contrast" in loss_type: summaries += [ negative_summary, positive_summary, contrast_loss_summary ] if "regression" in loss_type: summaries.append(regression_loss_summary) if "linear_model" in loss_type: summaries.append(model_loss_summary) summary = tf.summary.merge(summaries) outputs = [z_tar] if "contrast" in loss_type: outputs += [z_pos, z_neg] elif "linear_model" in loss_type: outputs += [z_pos] outputs += [total_loss, summary] train = U.function(inputs=inputs, outputs=outputs, updates=[optimize_expr]) eval = U.function(inputs=inputs, outputs=outputs, updates=[]) z_func = U.function( inputs=[obs_input_query], outputs=[z, h], ) norm_func = U.function(inputs=[obs_input_query], outputs=[tf.norm(z_tar, axis=1)]) return z_func, train, eval, norm_func
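# contrastive_loss_fc is defined elsewhere; with c_loss_type="sqmargin" a common choice is
# to pull the anchor toward its positive in squared distance and push it away from the
# negative up to a margin. A NumPy sketch of that square-margin form (an assumed variant
# for illustration, not necessarily the exact loss used in this file):
import numpy as np

def sqmargin_contrastive_loss(z, z_pos, z_neg, margin=1.0):
    """Mean squared pull on positives plus a squared hinge push on negatives."""
    pos_d2 = np.sum((z - z_pos) ** 2, axis=1)        # squared anchor-positive distance
    neg_d = np.linalg.norm(z - z_neg, axis=1)        # anchor-negative distance
    push = np.maximum(0.0, margin - neg_d) ** 2      # penalize negatives inside the margin
    return np.mean(pos_d2 + push)

rng = np.random.RandomState(0)
B, D = 8, 32
print(sqmargin_contrastive_loss(rng.randn(B, D), rng.randn(B, D), rng.randn(B, D)))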
def build_train_mer(input_type, obs_shape, model_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, scope="mfec", num_neg=10, latent_dim=32, alpha=0.1, beta=1e2, theta=10, loss_type=["contrast"], knn=4, c_loss_type="margin", b=100, batch_size=32, reuse=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. """ if c_loss_type != "infonce": assert num_neg == 1 # z_func = build_act_contrast(make_obs_ph, model_func, num_actions, scope=scope, secondary_scope="model_func", # reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders # EMDQN # tau = tf.placeholder(tf.float32, [1], name='tau') # momentum = tf.placeholder(tf.float32, [1], name='momentum') # make_obs_ph = lambda name: input_type(obs_shape, batch_size, name=name), magic_num = tf.get_variable(name='magic', shape=[1]) obs_input_query = U.ensure_tf_input( input_type(obs_shape, None, name="obs_query")) obs_input_positive = U.ensure_tf_input( input_type(obs_shape, batch_size, name="enc_obs_pos")) obs_input_negative = U.ensure_tf_input( input_type(obs_shape, batch_size * num_neg, name="enc_obs_neg")) obs_input_neighbour = U.ensure_tf_input( input_type(obs_shape, batch_size * knn, name="enc_obs_neighbour")) obs_input_uniformity_u = U.ensure_tf_input( input_type(obs_shape, batch_size, name="enc_obs_uni_u")) obs_input_uniformity_v = U.ensure_tf_input( input_type(obs_shape, batch_size, name="enc_obs_uni_v")) obs_input_weighted_product_u = U.ensure_tf_input( input_type(obs_shape, batch_size, name="enc_obs_wp_u")) obs_input_weighted_product_v = U.ensure_tf_input( input_type(obs_shape, batch_size, name="enc_obs_wp_v")) value_input_weighted_product_u = tf.placeholder(tf.float32, [batch_size], name="value_u") value_input_weighted_product_v = tf.placeholder(tf.float32, [batch_size], name="value_v") value_input_query = tf.placeholder(tf.float32, [batch_size], name="value") value_input_neighbour = tf.placeholder(tf.float32, [batch_size, knn], name="neighbour_value") action_embedding = tf.Variable(tf.random_normal( [num_actions, latent_dim], stddev=1), name="action_embedding") action_input = tf.placeholder(tf.int32, [batch_size], name="action") action_input_causal = tf.placeholder(tf.int32, [batch_size], name="action") reward_input_causal = tf.placeholder(tf.float32, [batch_size], 
name="action") inputs = [obs_input_query] if "contrast" in loss_type: inputs += [obs_input_positive, obs_input_negative] if "regression" in loss_type: inputs += [value_input_query] if "linear_model" in loss_type: inputs += [action_input] if "contrast" not in loss_type: inputs += [obs_input_positive] if "fit" in loss_type: # if "contrast" not in loss_type: # inputs+=[] inputs += [obs_input_neighbour, value_input_neighbour] if "regression" not in loss_type: inputs += [value_input_query] if "weight_product" in loss_type: inputs += [ obs_input_uniformity_u, obs_input_uniformity_v, obs_input_weighted_product_u, obs_input_weighted_product_v, value_input_weighted_product_u, value_input_weighted_product_v ] if "causality" in loss_type: inputs += [reward_input_causal, action_input_causal] z_old = model_func(obs_input_query.get(), num_actions, scope="target_model_func", reuse=False) z = model_func(obs_input_query.get(), num_actions, scope="model_func", reuse=tf.AUTO_REUSE) z_pos = model_func(obs_input_positive.get(), num_actions, scope="model_func", reuse=True) z_neg = model_func(obs_input_negative.get(), num_actions, scope="model_func", reuse=True) z_uni_u = model_func(obs_input_uniformity_u.get(), num_actions, scope="model_func", reuse=True) z_uni_v = model_func(obs_input_uniformity_v.get(), num_actions, scope="model_func", reuse=True) z_wp_u = model_func(obs_input_weighted_product_u.get(), num_actions, scope="model_func", reuse=True) z_wp_v = model_func(obs_input_weighted_product_v.get(), num_actions, scope="model_func", reuse=True) z_pos = tf.reshape(z_pos, [-1, latent_dim]) z_tar = tf.reshape(z, [-1, latent_dim]) if "contrast" in loss_type: z_neg = tf.reshape(z_neg, [-1, latent_dim]) contrast_loss, contrast_summary = contrastive_loss_fc( z_tar, z_pos, z_neg, c_type=c_loss_type, num_neg=num_neg, batch_size=batch_size, emb_dim=latent_dim) symmetry_loss, symmetry_summary = contrastive_loss_fc( z_pos, z_tar, z_neg, c_type=c_loss_type, num_neg=num_neg, batch_size=batch_size, emb_dim=latent_dim) contrast_loss += symmetry_loss z_neighbour = model_func(obs_input_neighbour.get(), num_actions, scope="model_func", reuse=True) # fit loss z_neighbour = tf.reshape(z_neighbour, [-1, knn, latent_dim]) square_dist = tf.square( tf.tile(tf.expand_dims(z_tar, 1), [1, knn, 1]) - z_neighbour) neighbour_dist = tf.reduce_sum(square_dist, axis=2) neighbour_coeff = tf.math.softmax(-neighbour_dist / b, axis=1) coeff_sum = tf.reduce_mean(tf.reduce_sum(neighbour_coeff, axis=1)) value_input_neighbour_mean = tf.reduce_mean(value_input_neighbour) fit_value = tf.reduce_sum(tf.multiply(neighbour_coeff, value_input_neighbour), axis=1) fit_loss = tf.reduce_mean(tf.abs(fit_value - value_input_query)) # causality loss reward_input_causal = tf.reshape(reward_input_causal, [1, -1]) reward_tile = tf.tile(reward_input_causal, [batch_size, 1]) # reward_mask = (reward_tile - tf.transpose(reward_tile)) ** 2 reward_mask = 1 - tf.cast( tf.equal((reward_tile - tf.transpose(reward_tile)), tf.constant(0.)), tf.float32) action_input_causal = tf.reshape(action_input_causal, [1, -1]) action_tile = tf.tile(action_input_causal, [batch_size, 1]) action_mask = tf.cast( tf.equal((action_tile - tf.transpose(action_tile)), tf.constant(0)), tf.float32) total_mask = tf.multiply(reward_mask, action_mask) z_tile = tf.tile(tf.expand_dims(z_tar, 1), [1, batch_size, 1]) z_diff = z_tile - tf.transpose(z_tile, perm=[1, 0, 2]) distance = tf.reduce_sum(z_diff**2, axis=2) exp_distance = tf.exp(-distance) causal_find_rate = (tf.reduce_sum(total_mask)) / 
(batch_size**2 - batch_size) causal_loss = tf.reduce_sum(tf.multiply(exp_distance, total_mask)) # regularization loss regularization_loss = -tf.maximum( 1., tf.reduce_mean(U.huber_loss(z_tar, 0.01))) regression_loss = tf.reduce_mean( tf.squared_difference(tf.norm(z_tar, axis=1), alpha * value_input_query)) + regularization_loss # linear model loss action_embeded = tf.matmul(tf.one_hot(action_input, num_actions), action_embedding) model_loss = tf.reduce_mean( tf.squared_difference(action_embeded + z_tar, z_pos)) + 0.01 * regularization_loss # weighted product loss uniformity_loss = tf.reduce_sum( tf.exp(2 * tf.reduce_sum(tf.multiply(z_uni_u, z_uni_v), axis=1) - 2)) value_weight = (value_input_weighted_product_u - value_input_weighted_product_v)**2 # angle = acos_safe(tf.reduce_sum(tf.multiply(z_wp_u, z_wp_v), axis=1)) angle = tf.reduce_sum(tf.multiply(z_wp_u, z_wp_v), axis=1) weighted_product = tf.multiply(value_weight, angle) wp_loss = tf.reduce_sum(weighted_product) total_loss = 0 if "contrast" in loss_type: total_loss += contrast_loss if "regression" in loss_type: total_loss += beta * regression_loss if "linear_model" in loss_type: total_loss += theta * model_loss if "fit" in loss_type: total_loss += beta * fit_loss if "causality" in loss_type: total_loss += theta * causal_loss if "weight_product" in loss_type: total_loss += 0.1 * uniformity_loss total_loss += wp_loss model_func_vars = U.scope_vars(U.absolute_scope_name("model_func")) model_func_vars_update = copy.copy(model_func_vars) if "linear_model" in loss_type: model_func_vars_update.append(action_embedding) target_model_func_vars = U.scope_vars( U.absolute_scope_name("target_model_func")) update_target_expr = [] for var in model_func_vars: print(var.name, var.shape) for var_target in target_model_func_vars: print(var_target.name, var_target.shape) for var, var_target in zip( sorted(model_func_vars, key=lambda v: v.name), sorted(target_model_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) if grad_norm_clipping is not None: optimize_expr = U.minimize_and_clip( optimizer, total_loss, var_list=model_func_vars_update, clip_val=grad_norm_clipping) else: optimize_expr = optimizer.minimize(total_loss, var_list=model_func_vars_update) # Create callable functions # update_target_fn will be called periodically to copy Q network to target Q network z_var_summary = tf.summary.scalar( "z_var", tf.reduce_mean(tf.math.reduce_std(z, axis=1))) if "contrast" in loss_type: z_neg = tf.reshape(z_neg, [batch_size, num_neg, latent_dim]) negative_summary = tf.summary.scalar( "negative_dist", tf.reduce_mean(emb_dist(z_tar, z_neg[:, 0, :]))) positive_summary = tf.summary.scalar( "positive_dist", tf.reduce_mean(emb_dist(z_tar, z_pos))) if "contrast" in loss_type: contrast_loss_summary = tf.summary.scalar( "contrast loss", tf.reduce_mean(contrast_loss)) regularization_loss_summary = tf.summary.scalar( "regularization loss", tf.reduce_mean(regularization_loss)) regression_loss_summary = tf.summary.scalar( "regression loss", tf.reduce_mean(regression_loss)) model_loss_summary = tf.summary.scalar("model loss", tf.reduce_mean(model_loss)) fit_loss_summary = tf.summary.scalar("fit loss", tf.reduce_mean(fit_loss)) fit_value_summary = tf.summary.scalar("fit value", tf.reduce_mean(fit_value)) neighbour_value_summary = tf.summary.scalar( "neighbour value", value_input_neighbour_mean) coeff_summary = tf.summary.scalar("coeff sum", coeff_sum) square_dist_summary = 
tf.summary.scalar("square_dist", tf.reduce_mean(square_dist)) z_neighbour_summary = tf.summary.scalar("z_neighbour_mean", tf.reduce_mean(z_neighbour)) # fit_loss_summary = tf.summary.scalar("fit loss", tf.reduce_mean(fit_loss)) # prediction_loss_summary = tf.summary.scalar("prediction loss", tf.reduce_mean(prediction_loss)) causal_efficiency_summary = tf.summary.scalar("causal efficiency", causal_find_rate) causal_loss_summary = tf.summary.scalar("causal loss", causal_loss) # reward_mask_summary = tf.summary.scalar("reward mask summary", debug_reward_mask) # action_mask_summary = tf.summary.scalar("action mask summary", debug_action_mask) uniformity_loss_summary = tf.summary.scalar("uniform loss", uniformity_loss) wp_loss_summary = tf.summary.scalar("weighted product loss", wp_loss) total_loss_summary = tf.summary.scalar("total loss", tf.reduce_mean(total_loss)) summaries = [ z_var_summary, total_loss_summary, regularization_loss_summary ] if "contrast" in loss_type: summaries += [ negative_summary, positive_summary, contrast_loss_summary ] summaries += contrast_summary if "regression" in loss_type: summaries.append(regression_loss_summary) if "linear_model" in loss_type: summaries.append(model_loss_summary) if "contrast" not in loss_type: summaries.append(positive_summary) if "fit" in loss_type: summaries.append(fit_loss_summary) summaries.append(fit_value_summary) summaries.append(neighbour_value_summary) summaries.append(coeff_summary) summaries.append(square_dist_summary) summaries.append(z_neighbour_summary) if "causality" in loss_type: summaries.append(causal_efficiency_summary) summaries.append(causal_loss_summary) # summaries.append(reward_mask_summary) # summaries.append(action_mask_summary) if "weight_product" in loss_type: summaries.append(uniformity_loss_summary) summaries.append(wp_loss_summary) summary = tf.summary.merge(summaries) outputs = [total_loss, summary] train = U.function(inputs=inputs, outputs=outputs, updates=[optimize_expr]) eval = U.function(inputs=inputs, outputs=outputs, updates=[]) z_func = U.function( inputs=[obs_input_query], outputs=[z_old], ) norm_func = U.function(inputs=[obs_input_query], outputs=[tf.norm(z_tar, axis=1)]) update_target_func = U.function([], [], updates=[update_target_expr]) return z_func, train, eval, norm_func, update_target_func
def build_train_dueling(make_obs_ph, q_func, model_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, scope="deepq", input_dim=84 * 84 * 4, hash_dim=32, use_rp=False, imitate=False, reuse=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. 
""" act_f = build_act_dueling(make_obs_ph, q_func, model_func, num_actions, input_dim, hash_dim, use_rp, scope=scope, reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t")) act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") if imitate: imitate_act_t_ph = tf.placeholder(tf.float32, [None, num_actions], name="imitate_action") # EMDQN value_t_ph = tf.placeholder(tf.float32, [None], name='value_t') value_tp1_ph = tf.placeholder(tf.float32, [None], name='value_tp1') value_tp1_masked = (1.0 - done_mask_ph) * value_tp1_ph # q network evaluation q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act # q_t_normalized = q_t - tf.max(q_t,) q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) # compute RHS of bellman equation q_target = rew_t_ph + gamma * value_tp1_masked # compute the error (potentially clipped) td_error = q_target - (q_t_selected + value_t_ph) td_summary = tf.summary.scalar("td error", tf.reduce_mean(td_error)) # EMDQN print(q_t.shape) if imitate: imitation_loss = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(labels=imitate_act_t_ph, logits=q_t), axis=1) print(imitation_loss.shape) errors = U.huber_loss(td_error) + imitation_loss else: errors = U.huber_loss(td_error) total_summary = tf.summary.scalar("total error", tf.reduce_mean(errors)) value_summary = tf.summary.scalar("value_t", tf.reduce_mean(value_t_ph)) value_tp1_summary = tf.summary.scalar("value_tp1", tf.reduce_mean(value_tp1_ph)) q_summary = tf.summary.scalar("estimated qs", tf.reduce_mean(q_t_selected)) summaries=[td_summary, total_summary, value_summary, value_tp1_summary, q_summary] if imitate: imitate_summary = tf.summary.scalar("imitate loss", tf.reduce_mean(imitation_loss)) summaries.append(imitate_summary) summary = tf.summary.merge(summaries) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: optimize_expr = U.minimize_and_clip(optimizer, weighted_error, var_list=q_func_vars, clip_val=grad_norm_clipping) else: optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) # update_target_fn will be called periodically to copy Q network to target Q network inputs = [ obs_t_input, act_t_ph, rew_t_ph, done_mask_ph, importance_weights_ph, value_t_ph, value_tp1_ph ] if imitate: inputs.append(imitate_act_t_ph) # Create callable functions # EMDQN train = U.function( inputs=inputs, outputs=[td_error, summary], updates=[optimize_expr] ) return act_f, train
def build_train(make_obs_ph, q_func, num_actions, grad_norm_clipping=None, gamma=1.0, deterministic_filter=False,
                random_filter=False, double_q=True, scope="deepq", reuse=None, param_noise=False,
                param_noise_filter_func=None):
    """Creates the train function.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    deterministic_filter: bool
        if True, invalid actions are masked out of the Q-values used to build the Bellman target
    random_filter: bool
        forwarded to the act-building function
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (float, object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation; the learning rate is passed as the first argument.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
""" if param_noise: act_f = build_act_with_param_noise( make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, param_noise_filter_func=param_noise_filter_func, deterministic_filter=deterministic_filter, random_filter=random_filter) else: act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, deterministic_filter=deterministic_filter, random_filter=random_filter) with tf.variable_scope(scope, reuse=reuse): # set up placeholders lr_ph = tf.placeholder(tf.float32, name="lr") obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t")) act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(U.data_type, [None], name="reward") obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1")) done_mask_ph = tf.placeholder(U.data_type, [None], name="done") importance_weights_ph = tf.placeholder(U.data_type, [None], name="weight") board_size = obs_t_input.get().get_shape().as_list()[1] obs_t = transform_obses(obs_t_input.get()) obs_tp1 = transform_obses(obs_tp1_input.get()) act_t = transform_actions(act_t_ph, board_size) if deterministic_filter: invalid_masks_tp1 = build_invalid_masks(obs_tp1) # q network evaluation q_t = q_func(obs_t, num_actions, scope="q_func", reuse=True) # reuse parameters from act q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) # target q network evalution q_tp1 = q_func(obs_tp1, num_actions, scope="target_q_func") target_q_func_vars = U.scope_vars( U.absolute_scope_name("target_q_func")) # q scores for actions which we know were selected in the given state. q_t_selected = tf.reduce_sum( q_t * tf.one_hot(act_t, num_actions, dtype=U.data_type), axis=1) # compute estimate of best possible value starting from state at t + 1 if double_q: q_tp1_using_online_net = q_func(obs_tp1, num_actions, scope="q_func", reuse=True) if deterministic_filter: q_tp1_using_online_net = build_q_filter( q_tp1_using_online_net, invalid_masks_tp1) q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1, output_type=U.index_type) q_tp1_best = tf.reduce_sum( q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions, dtype=U.data_type), 1) else: if deterministic_filter: q_tp1 = build_q_filter(q_tp1, invalid_masks_tp1) q_tp1_best = tf.reduce_max(q_tp1, axis=1) q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best # compute RHS of bellman equation q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) weighted_error = tf.reduce_mean(importance_weights_ph * U.huber_loss(td_error)) regularizer = tf.add_n([tf.nn.l2_loss(var) for var in q_func_vars]) * 0.0001 total_error = weighted_error + regularizer # optimizer = tf.train.MomentumOptimizer( # learning_rate=lr_ph, momentum=0.9) optimizer = tf.train.AdamOptimizer(learning_rate=lr_ph) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: optimize_expr = U.minimize_and_clip(optimizer, total_error, var_list=q_func_vars, clip_val=grad_norm_clipping) else: optimize_expr = optimizer.minimize(total_error, var_list=q_func_vars) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # Create callable functions train = U.function(inputs=[ lr_ph, obs_t_input, act_t_ph, rew_t_ph, 
obs_tp1_input, done_mask_ph, importance_weights_ph ], outputs=[td_error, weighted_error, total_error], updates=[optimize_expr]) update_target = U.function([], [], updates=[update_target_expr]) q_values = U.function([obs_t_input], q_t) return act_f, train, update_target, {'q_values': q_values}
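

# Sketch of a DQN-style update loop over the functions returned by build_train
# above. Because the optimizer is created inside the graph with a learning-rate
# placeholder, the learning rate is fed as the first positional input on every
# call. `replay_samples` and `target_update_interval` are illustrative names,
# not definitions from this file.
def _example_dqn_updates(train, update_target, replay_samples, learning_rate=1e-4,
                         target_update_interval=500):
    for step, (obses_t, actions, rewards, obses_tp1, dones, weights) in enumerate(replay_samples):
        td_errors, weighted_error, total_error = train(
            learning_rate, obses_t, actions, rewards, obses_tp1, dones, weights)
        if step % target_update_interval == 0:
            update_target()  # copy online Q-network weights into the target network
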
def __init__(self, inputs: TrainInputs, action_space, observation_space):
    act_size = action_space.n
    optimizer = tf.train.AdamOptimizer(learning_rate=inputs.lr)

    # child scopes of a reusable parent scope are reusable
    with tf.variable_scope('q_func'):
        self.runner = q_policy(obs=inputs.s0, epsilon=inputs.eps, action_space=action_space)
    with tf.variable_scope('q_func', reuse=True):
        q_net = q_policy(obs=inputs.s0, epsilon=inputs.eps, action_space=action_space)
    with tf.variable_scope('target_q_func'):
        target_q_net = q_policy(obs=inputs.s1, epsilon=inputs.eps, action_space=action_space)

    update_target_op = tf.group(*[
        tf.assign(a, b) for a, b in zip(target_q_net.trainables, q_net.trainables)
    ])

    if G.double_q:
        with tf.variable_scope('q_func', reuse=True):
            inner_q_net = q_policy(obs=inputs.s1, epsilon=inputs.eps, action_space=action_space)

    with tf.variable_scope('Q_training'):
        q_sampled = tf.reduce_sum(q_net.q_values * tf.one_hot(inputs.act, act_size), axis=1)
        if G.double_q:
            q_asterisk = tf.reduce_sum(
                target_q_net.q_values * tf.one_hot(inner_q_net.act_argmax, act_size), axis=1)
        else:
            q_asterisk = tf.reduce_max(target_q_net.q_values, axis=1)

        # compute RHS of bellman equation
        T_q = inputs.rew + (1.0 - inputs.done_mask_ph) * G.gamma * q_asterisk

        # compute the error (potentially clipped)
        td_error = q_sampled - tf.stop_gradient(T_q)
        huber_td = U.huber_loss(td_error)
        if G.prioritized_replay:
            loss = tf.reduce_mean(inputs.sample_weights * huber_td)
        else:
            loss = tf.reduce_mean(huber_td)

        # compute optimization op (potentially with gradient clipping)
        if G.grad_norm_clipping:
            optimize_op = U.minimize_and_clip(
                optimizer, loss, var_list=q_net.trainables, clip_val=G.grad_norm_clipping)
        else:
            optimize_op = optimizer.minimize(loss, var_list=q_net.trainables)

    def train(*, s0s, actions, rewards, s1s, dones, sample_weights=None):
        """Run one optimization step on a batch of transitions (s0, act, rew, s1, done)."""
        feed_dict = {
            inputs.lr: G.learning_rate,
            inputs.s0: s0s,
            inputs.act: actions,
            inputs.rew: rewards,
            inputs.s1: s1s,
            inputs.done_mask_ph: dones
        }
        if G.prioritized_replay:
            assert sample_weights is not None, "sample_weights is required when prioritized_replay is ON."
            feed_dict[inputs.sample_weights] = sample_weights
        td_error_val, loss_val, _ = U.get_session().run([td_error, loss, optimize_op], feed_dict)
        return td_error_val, loss_val

    def update_target():
        U.get_session().run(update_target_op)

    self.train = train
    self.update_target = update_target
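
# Usage sketch (illustrative only): once constructed, a training loop would
# alternate calls to `self.train(...)` on sampled transition batches with
# periodic calls to `self.update_target()`, e.g.
#
#     td, loss = agent.train(s0s=b.s0, actions=b.a, rewards=b.r, s1s=b.s1, dones=b.done,
#                            sample_weights=b.w if G.prioritized_replay else None)
#     if step % target_interval == 0:
#         agent.update_target()
#
# where `agent`, `b`, `step`, and `target_interval` are caller-side names
# assumed for the example.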