import tensorflow as tf

import maddpg.common.tf_util as U
from maddpg.common.distributions import make_pdtype


def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, num_units=64,
            scope="trainer", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
                    for i in range(len(act_space_n))]

        p_input = obs_ph_n[p_index]

        # policy network outputs the flat distribution parameters for this agent
        p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="p_func", num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)

        act_sample = act_pd.sample()
        # regularize the raw distribution parameters
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # copy the joint action placeholders, substituting this agent's sampled action
        act_input_n = act_ph_n + []
        act_input_n[p_index] = act_pd.sample()  # act_pd.mode()

        # centralized critic input by default; local_q_func restricts it to this agent
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        # reuse the critic built by q_train; the policy gradient maximizes its output
        q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0]
        pg_loss = -tf.reduce_mean(q)

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target network
        target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="target_p_func", num_units=num_units)
        target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        # soft (Polyak) target update, plus a hard copy (rate=1.0) for syncing
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)
        sync_target_p = make_update_exp(p_func_vars, target_p_func_vars, rate=1.0)

        target_act_pd = act_pdtype_n[p_index].pdfromflat(target_p)
        target_act_sample = target_act_pd.sample()
        target_act_mode = target_act_pd.mode()
        target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample)
        target_mode = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_mode)
        target_p_values = U.function([obs_ph_n[p_index]], target_p)

        return act, train, update_target_p, sync_target_p, {
            'p_values': p_values, 'target_p_values': target_p_values,
            'target_mode': target_mode, 'target_act': target_act}
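# Both trainers here call make_update_exp, which is defined elsewhere in this
# module. The following is a minimal sketch consistent with the call sites
# above; the `rate` argument (rate=1.0 meaning a hard parameter copy, as used
# by sync_target_p) is an assumption inferred from those calls, not confirmed
# against the original definition.
def make_update_exp(vals, target_vals, rate=1e-2):
    polyak = 1.0 - rate
    expression = []
    # pair source and target variables by name so they line up deterministically
    for var, var_target in zip(sorted(vals, key=lambda v: v.name),
                               sorted(target_vals, key=lambda v: v.name)):
        expression.append(var_target.assign(polyak * var_target + (1.0 - polyak) * var))
    expression = tf.group(*expression)
    return U.function([], [], updates=[expression])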
def q_train(make_obs_ph_n, act_space_n, q_func, optimizer,
            grad_norm_clipping=None, scope="trainer", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
                    for i in range(len(act_space_n))]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        # centralized critic: conditioned on every agent's observation and action
        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        q = q_func(q_input, 1, scope="q_func")[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # squared Bellman error against the externally computed TD target
        q_loss = tf.reduce_mean(tf.square(q - target_ph))
        # q_loss = tf.reduce_mean(U.huber_loss(q - target_ph))

        # TEMP: just want to give a viscosity solution to the Bellman differential
        # equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss + 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # target network
        target_q = q_func(q_input, 1, scope="target_q_func")[:, 0]
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        return train, update_target_q, {'q_values': q_values,
                                        'target_q_values': target_q_values}
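# A hedged usage sketch. The mlp_model builder and the toy observation shapes
# and action spaces below are illustrative assumptions, not part of this
# module; only U.BatchInput, p_train, and q_train come from the code above.
# Note the ordering: q_train must be built first, because p_train re-enters
# the "q_func" variable scope with reuse=True.
if __name__ == "__main__":
    import tensorflow.contrib.layers as layers
    from gym import spaces

    def mlp_model(input, num_outputs, scope, reuse=False, num_units=64):
        # two ReLU hidden layers, linear output
        with tf.variable_scope(scope, reuse=reuse):
            out = layers.fully_connected(input, num_outputs=num_units, activation_fn=tf.nn.relu)
            out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu)
            return layers.fully_connected(out, num_outputs=num_outputs, activation_fn=None)

    obs_shape_n = [(8,), (8,)]                              # toy observation shapes
    act_space_n = [spaces.Discrete(5), spaces.Discrete(5)]  # toy action spaces
    obs_ph_n = [U.BatchInput(shape, name="observation" + str(i)).get()
                for i, shape in enumerate(obs_shape_n)]

    train_q, update_target_q, q_debug = q_train(
        obs_ph_n, act_space_n, q_func=mlp_model,
        optimizer=tf.train.AdamOptimizer(1e-2), grad_norm_clipping=0.5)
    act, train_p, update_target_p, sync_target_p, p_debug = p_train(
        obs_ph_n, act_space_n, p_index=0, p_func=mlp_model, q_func=mlp_model,
        optimizer=tf.train.AdamOptimizer(1e-2), grad_norm_clipping=0.5)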