def q_train_group(make_obs_ph_n, act_space_n, q_index, q_func, optimizer,
                  grad_norm_clipping=None, local_q_func=False,
                  scope="trainer", reuse=None, num_units=64):
    """Build the "group" critic graph together with its target network.

    Everything is created inside ``tf.variable_scope(scope, reuse=reuse)``.

    Args:
        make_obs_ph_n: used directly as the list of observation placeholders
            (it is list-concatenated with the action placeholders and
            indexed by ``q_index``).
        act_space_n: action spaces; each one is turned into a distribution
            type with ``make_pdtype``.
        q_index: index of the observation/action pair fed to the critic when
            ``local_q_func`` is true.
        q_func: network builder, called as
            ``q_func(input, 1, scope=..., num_units=...)``.
        optimizer: passed through to ``U.minimize_and_clip``.
        grad_norm_clipping: passed through to ``U.minimize_and_clip``.
        local_q_func: when true the critic only sees the ``q_index``
            observation and action instead of all of them.
        scope: enclosing variable scope name.
        reuse: ``reuse`` flag of the enclosing variable scope.
        num_units: forwarded to ``q_func``.

    Returns:
        ``(train, update_target_q, debug)`` where ``train`` takes all
        observations, all actions and the target values and returns the
        loss, ``update_target_q`` is the result of ``make_update_exp`` for
        the online/target variables, and ``debug`` maps ``'q_values_group'``
        and ``'target_q_values_group'`` to callables over observations and
        actions.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = []
        for idx, pdtype in enumerate(pdtypes):
            act_ph_n.append(
                pdtype.sample_placeholder(
                    [None], name="action_group{}".format(idx)))
        target_ph = tf.placeholder(tf.float32, [None], name="target_group")

        # The joint input is always built first; the local variant then
        # replaces it with the single selected observation/action pair.
        critic_in = tf.concat(obs_ph_n + act_ph_n, 1)
        if local_q_func:
            critic_in = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)

        online_out = q_func(critic_in, 1, scope="q_func_group",
                            num_units=num_units)
        q = online_out[:, 0]
        online_vars = U.scope_vars(U.absolute_scope_name("q_func_group"))

        td_error = q - target_ph
        q_loss = tf.reduce_mean(tf.square(td_error))

        # viscosity solution to Bellman differential equation in place of
        # an initial condition
        # (the regulariser op is built but currently left out of the loss)
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss  # + 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(
            optimizer, loss, online_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(
            inputs=obs_ph_n + act_ph_n + [target_ph],
            outputs=loss,
            updates=[optimize_expr],
        )
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # target network
        target_out = q_func(critic_in, 1, scope="target_q_func_group",
                            num_units=num_units)
        target_q = target_out[:, 0]
        target_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func_group"))
        update_target_q = make_update_exp(online_vars, target_vars)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        debug = {
            'q_values_group': q_values,
            'target_q_values_group': target_q_values,
        }
        return train, update_target_q, debug
def p_train_attention(make_obs_ph_n, act_space_n, p_index, p_func, q_func,
                      optimizer, grad_norm_clipping=None, local_q_func=False,
                      num_units=64, scope="trainer", reuse=None):
    """Build the "attention" policy graph for agent ``p_index``.

    The policy is trained to maximise the value given by the critic in scope
    ``"q_func_attention"``, which is opened with ``reuse=True`` (so it is
    presumably created elsewhere under the same enclosing scope - verify
    against the caller). Only the policy variables are handed to the
    optimizer.

    Args:
        make_obs_ph_n: used directly as the list of observation placeholders.
        act_space_n: action spaces; each is converted with ``make_pdtype``.
        p_index: index of the agent whose policy is built.
        p_func: policy network builder, called as
            ``p_func(obs, n_outputs, scope=..., num_units=...)``.
        q_func: critic network builder, called as
            ``q_func(input, 1, scope=..., reuse=True, num_units=...)``.
        optimizer: passed through to ``U.minimize_and_clip``.
        grad_norm_clipping: passed through to ``U.minimize_and_clip``.
        local_q_func: when true the critic only sees this agent's own
            observation and sampled action.
        num_units: forwarded to ``p_func`` and ``q_func``.
        scope: enclosing variable scope name.
        reuse: ``reuse`` flag of the enclosing variable scope.

    Returns:
        ``(act, train, update_target_p, debug)``; ``debug`` maps
        ``'p_values_attention'`` and ``'target_act_attention'`` to callables
        over this agent's observation.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        pdtypes = [make_pdtype(space) for space in act_space_n]
        own_pdtype = pdtypes[p_index]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = []
        for idx, pdtype in enumerate(pdtypes):
            act_ph_n.append(
                pdtype.sample_placeholder(
                    [None], name="action_attention{}".format(idx)))

        own_obs = obs_ph_n[p_index]

        policy_width = int(own_pdtype.param_shape()[0])
        p = p_func(own_obs, policy_width, scope="p_func_attention",
                   num_units=num_units)
        policy_vars = U.scope_vars(U.absolute_scope_name("p_func_attention"))

        # wrap parameters in distribution
        act_pd = own_pdtype.pdfromflat(p)

        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # Copy the action placeholders and swap this agent's slot for a
        # (second, separately built) sample from its current policy.
        act_input_n = list(act_ph_n)
        act_input_n[p_index] = act_pd.sample()

        critic_in = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            critic_in = tf.concat([own_obs, act_input_n[p_index]], 1)
        critic_out = q_func(critic_in, 1, scope="q_func_attention",
                            reuse=True, num_units=num_units)
        q = critic_out[:, 0]
        pg_loss = -tf.reduce_mean(q)

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(
            optimizer, loss, policy_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(
            inputs=obs_ph_n + act_ph_n,
            outputs=loss,
            updates=[optimize_expr],
        )
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target network
        target_width = int(own_pdtype.param_shape()[0])
        target_p = p_func(own_obs, target_width,
                          scope="target_p_func_attention",
                          num_units=num_units)
        target_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func_attention"))
        update_target_p = make_update_exp(policy_vars, target_vars)

        target_pd = own_pdtype.pdfromflat(target_p)
        target_act_sample = target_pd.sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]],
                                outputs=target_act_sample)

        debug = {
            'p_values_attention': p_values,
            'target_act_attention': target_act,
        }
        return act, train, update_target_p, debug
def q_train(make_obs_ph_n, act_space_n, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, scope="trainer",
            reuse=None, num_units=64):
    """Build one critic per agent, trained jointly, plus target critics.

    Every critic receives the same input: all observations concatenated with
    all action placeholders. The training loss is the sum over agents of the
    mean squared difference between each critic's output and its target
    placeholder.

    Args:
        make_obs_ph_n: used directly as the list of observation
            placeholders; its length defines the number of agents.
        act_space_n: action spaces; each is converted with ``make_pdtype``.
        q_func: critic network builder, called as
            ``q_func(input, 1, scope=..., num_units=...)``.
        optimizer: passed through to ``U.minimize_and_clip``.
        grad_norm_clipping: passed through to ``U.minimize_and_clip``.
        local_q_func: accepted but not read by this function.
        scope: enclosing variable scope name.
        reuse: ``reuse`` flag of the enclosing variable scope.
        num_units: forwarded to ``q_func``.

    Returns:
        ``(train, update_target_q, debug)``. ``train`` takes observations,
        actions, per-agent targets and the two boolean flag placeholders;
        ``debug`` maps ``'q_values'`` and ``'target_q_values'`` to callables
        over observations, actions and the two flags.
    """
    with tf.variable_scope(scope, reuse=reuse):
        num_agents = len(make_obs_ph_n)

        # create distributions
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action_{}".format(idx))
            for idx, pdtype in enumerate(pdtypes)
        ]
        target_ph_n = []
        for agent in range(num_agents):
            target_ph_n.append(
                tf.placeholder(tf.float32, [None],
                               name="target_{}".format(agent)))
        # The two flags are part of every callable's inputs but are not
        # consumed by any op built in this function.
        is_norm_training = tf.placeholder(tf.bool)
        is_inference = tf.placeholder(tf.bool)

        critic_in = tf.concat(obs_ph_n + act_ph_n, 1)

        q_n = []
        for agent in range(num_agents):
            online_out = q_func(critic_in, 1,
                                scope="q_func_{}".format(agent),
                                num_units=num_units)
            q_n.append(online_out[:, 0])
        online_vars_n = [
            U.scope_vars(U.absolute_scope_name("q_func_{}".format(agent)))
            for agent in range(num_agents)
        ]

        q_loss_n = []
        for q_agent, target_agent in zip(q_n, target_ph_n):
            q_loss_n.append(tf.reduce_mean(tf.square(q_agent - target_agent)))

        # viscosity solution to Bellman differential equation in place of
        # an initial condition
        # q_reg = tf.reduce_mean(tf.square(q))
        q_loss = tf.reduce_sum(q_loss_n)
        loss = q_loss  # + 1e-3 * q_reg

        online_vars = list(itertools.chain.from_iterable(online_vars_n))
        optimize_expr = U.minimize_and_clip(
            optimizer, loss, online_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(
            inputs=(obs_ph_n + act_ph_n + target_ph_n
                    + [is_norm_training, is_inference]),
            outputs=loss,
            updates=[optimize_expr],
        )
        q_values = U.function(
            obs_ph_n + act_ph_n + [is_norm_training, is_inference], q_n)

        # target network
        target_q_n = []
        for agent in range(num_agents):
            target_out = q_func(critic_in, 1,
                                scope="target_q_func_{}".format(agent),
                                num_units=num_units)
            target_q_n.append(target_out[:, 0])
        target_vars_n = [
            U.scope_vars(
                U.absolute_scope_name("target_q_func_{}".format(agent)))
            for agent in range(num_agents)
        ]
        target_vars = list(itertools.chain.from_iterable(target_vars_n))
        update_target_q = make_update_exp(online_vars, target_vars)

        target_q_values = U.function(
            obs_ph_n + act_ph_n + [is_norm_training, is_inference],
            target_q_n)

        debug = {
            'q_values': q_values,
            'target_q_values': target_q_values,
        }
        return train, update_target_q, debug
def p_train_function(self, make_obs_ph_n, act_space_n, before_com_func,
                     channel, after_com_func, q_func, optimizer,
                     grad_norm_clipping=None, local_q_func=False,
                     num_units=64, scope="trainer", reuse=None, beta=0.05,
                     ibmac_com=True):
    """Build all agents' policies, the message channel and target networks.

    For every agent an encoder (``before_com_func``) maps its observation to
    a hidden vector. The hidden vectors of all agents are concatenated,
    detached with ``tf.stop_gradient`` and fed to ``channel``, whose three
    outputs are split per agent into messages, means and log-scales. With
    communication enabled each agent's policy head (``after_com_func``)
    receives ``hidden + message``; otherwise it receives ``hidden`` only.

    The loss is the summed policy-gradient loss (negative mean critic value,
    critics opened with ``reuse=True``) plus ``1e-3`` times the summed
    policy regulariser, plus ``beta`` times the mean message KL term when
    communication is enabled.

    Args:
        make_obs_ph_n: used directly as the list of observation
            placeholders; its length defines the number of agents.
        act_space_n: action spaces; each is converted with ``make_pdtype``.
        before_com_func: encoder builder, called as
            ``before_com_func(obs, num_units, scope=..., num_units=...)``.
        channel: channel builder; its result must unpack into exactly three
            tensors, each split into ``num_agents`` parts along axis 1.
        after_com_func: policy head builder.
        q_func: critic builder; called with ``reuse=True`` on scopes
            ``"q_func_{i}"`` (presumably created by the critic trainer under
            the same enclosing scope - verify against the caller).
        optimizer: passed through to ``U.minimize_and_clip``.
        grad_norm_clipping: passed through to ``U.minimize_and_clip``.
        local_q_func: accepted but not read by this function.
        num_units: hidden width forwarded to the network builders.
        scope: enclosing variable scope name.
        reuse: ``reuse`` flag of the enclosing variable scope.
        beta: weight of the message KL term in the loss.
        ibmac_com: when false, messages are not added to the hidden vectors,
            the KL term is dropped from the loss and the channel variables
            are excluded from optimisation and target updates.

    Reads ``self.alpha`` as the scale in the KL term (the original comment
    calls it the bandwidth).

    Returns:
        ``(act, train, update_target_p, debug)``. ``debug`` maps
        ``'p_values'``, ``'target_act'``, ``'kl_loss'``, ``'check_values'``,
        ``'channel_com'``, ``'check_mu'``, ``'check_log'``,
        ``'check_message_n'``, ``'check_hiddens_n'`` and ``'check_entropy'``
        to callables. Without communication the four channel-related entries
        are stubs that accept any arguments and return ``0``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        clip_threshold = 1  # 1, 5, 10
        is_norm_training = tf.placeholder(tf.bool)
        is_inference = tf.placeholder(tf.bool)
        num_agents = len(make_obs_ph_n)

        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None],
                                               name="action" + str(i))
            for i in range(num_agents)
        ]

        def obs_function(outputs):
            # Callable over all observations plus the two mode flags; a new
            # input list is built for every callable.
            return U.function(
                inputs=obs_ph_n + [is_norm_training, is_inference],
                outputs=outputs)

        # ---- online network ------------------------------------------
        hiddens_n = [
            before_com_func(obs_ph_n[i], num_units,
                            scope="before_com_{}".format(i),
                            num_units=num_units)
            for i in range(num_agents)
        ]
        before_com_vars_n = [
            U.scope_vars(U.absolute_scope_name("before_com_{}".format(i)))
            for i in range(num_agents)
        ]

        # Second pass through the same encoders (reuse=True); the result is
        # detached so the channel does not back-propagate into the encoders.
        hiddens_n_for_message = tf.concat([
            before_com_func(obs_ph_n[i], num_units,
                            scope="before_com_{}".format(i), reuse=True,
                            num_units=num_units)
            for i in range(num_agents)
        ], axis=1)
        hiddens_n_for_message = tf.stop_gradient(hiddens_n_for_message)
        channel_output = channel(hiddens_n_for_message,
                                 num_units * num_agents, scope="channel",
                                 num_units=num_units * num_agents)
        message_n, mu_message_n, logvar_message_n = [
            tf.split(item, num_or_size_splits=num_agents, axis=1)
            for item in channel_output
        ]
        # constrain kl_loss not to be too large
        logvar_message_n = [
            tf.clip_by_value(log, -10, 10) for log in logvar_message_n
        ]

        message_n = [
            clip_message(message, clip_threshold, is_norm_training,
                         is_inference)
            for message in message_n
        ]

        channel_vars_n = [U.scope_vars(U.absolute_scope_name("channel"))]

        if ibmac_com:
            # Build hidden + message once; it is both the policy-head input
            # and the tensor exposed through 'check_values'.
            check_n = [hiddens_n[i] + message_n[i]
                       for i in range(num_agents)]
            policy_input_n = check_n
        else:
            print('no_com')
            check_n = None
            policy_input_n = hiddens_n
        p_n = [
            after_com_func(policy_input_n[i],
                           int(act_pdtype_n[i].param_shape()[0]),
                           scope="p_func_{}".format(i),
                           num_units=num_units)
            for i in range(num_agents)
        ]
        p_func_vars = [
            U.scope_vars(U.absolute_scope_name("p_func_{}".format(i)))
            for i in range(num_agents)
        ]

        # wrap parameters in distribution
        act_pd_n = [
            act_pdtype_n[i].pdfromflat(p_n[i]) for i in range(num_agents)
        ]

        act_sample_n = [act_pd.sample() for act_pd in act_pd_n]
        p_reg_n = [
            tf.reduce_mean(tf.square(act_pd.flatparam()))
            for act_pd in act_pd_n
        ]

        # For agent i's critic, copy the action placeholders and replace
        # slot i with a sample from agent i's current policy.
        act_input_n_n = [list(act_ph_n) for _ in range(num_agents)]
        for i in range(num_agents):
            act_input_n_n[i][i] = act_pd_n[i].sample()
        q_input_n = [
            tf.concat(obs_ph_n + act_input_n, 1)
            for act_input_n in act_input_n_n
        ]

        q_n = [
            q_func(q_input_n[i], 1, scope="q_func_{}".format(i),
                   reuse=True, num_units=num_units)[:, 0]
            for i in range(num_agents)
        ]
        pg_loss_n = [-tf.reduce_mean(q) for q in q_n]

        # Message KL term with alpha = bandwidth (bw=1, b1+b2 = 1 in the
        # original notes):
        #   (mu^2 + exp(log)^2) / (2 * alpha^2) - log - ln(alpha) - 0.5
        # Earlier hard-coded variants used alpha = 0.5, 1, 5 and 10.
        # `0.5 / ...` replaces `1 / 2 * 1 / ...`, which is 0 under integer
        # division.
        # NOTE(review): the earlier hard-coded variants added ln(alpha),
        # this one subtracts it. It is a constant offset that does not
        # affect gradients, but it shifts the reported 'kl_loss' - confirm
        # the intended sign.
        kl_scale = 0.5 / tf.pow(self.alpha, 2)
        kl_loss_message_n = [
            kl_scale * (tf.pow(mu, 2) + tf.pow(tf.exp(log), 2))
            - log - np.log(self.alpha) - 0.5
            for mu, log in zip(mu_message_n, logvar_message_n)
        ]

        # NOTE(review): diagnostic only. 1.4189 looks like 0.5*ln(2*pi*e);
        # if so a Gaussian entropy would add `log`, not `exp(log)` - confirm
        # before relying on this value.
        entropy = [tf.exp(log) + 1.4189 for log in logvar_message_n]

        pg_loss = tf.reduce_sum(pg_loss_n)
        p_reg = tf.reduce_sum(p_reg_n)
        kl_loss_message = tf.reduce_mean(kl_loss_message_n)
        loss = pg_loss + p_reg * 1e-3
        if ibmac_com:
            loss = loss + beta * kl_loss_message

        kl_loss = U.function(
            inputs=obs_ph_n + act_ph_n + [is_norm_training, is_inference],
            outputs=kl_loss_message)

        # Order matters: the target variable list below is assembled in the
        # same order and paired with this one by make_update_exp.
        var_groups = list(before_com_vars_n)
        if ibmac_com:
            var_groups.extend(channel_vars_n)
        var_groups.extend(p_func_vars)
        var_list = list(itertools.chain.from_iterable(var_groups))
        optimize_expr = U.minimize_and_clip(optimizer, loss, var_list,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(
            inputs=obs_ph_n + act_ph_n + [is_norm_training, is_inference],
            outputs=loss,
            updates=[optimize_expr])
        act = obs_function(act_sample_n)
        p_values = obs_function(p_n)
        if ibmac_com:
            check_values = obs_function(check_n)
            channel_com = obs_function(channel_output)
            check_mu = obs_function(mu_message_n)
            check_log = obs_function(logvar_message_n)
        else:
            def disabled_probe(*args, **kwargs):
                # Stand-in for the channel diagnostics. It must tolerate the
                # same multi-argument call as the real callables; the former
                # one-argument lambda raised TypeError in that case.
                return 0

            check_values = disabled_probe
            channel_com = disabled_probe
            check_mu = disabled_probe
            check_log = disabled_probe

        # ---- target network ------------------------------------------
        target_hiddens_n = [
            before_com_func(obs_ph_n[i], num_units,
                            scope="target_before_com_{}".format(i),
                            num_units=num_units)
            for i in range(num_agents)
        ]
        target_before_com_vars = [
            U.scope_vars(
                U.absolute_scope_name("target_before_com_{}".format(i)))
            for i in range(num_agents)
        ]

        target_hiddens_n_for_message = tf.concat([
            before_com_func(obs_ph_n[i], num_units,
                            scope="target_before_com_{}".format(i),
                            reuse=True, num_units=num_units)
            for i in range(num_agents)
        ], axis=1)
        target_hiddens_n_for_message = tf.stop_gradient(
            target_hiddens_n_for_message)
        target_channel_output = channel(target_hiddens_n_for_message,
                                        num_units * num_agents,
                                        scope="target_channel",
                                        num_units=num_units * num_agents)
        # Only the messages are used on the target side.
        # NOTE(review): unlike the online messages these are not passed
        # through clip_message - confirm this asymmetry is intended.
        target_message_n, _, _ = [
            tf.split(item, num_or_size_splits=num_agents, axis=1)
            for item in target_channel_output
        ]
        target_channel_vars = [
            U.scope_vars(U.absolute_scope_name("target_channel"))
        ]

        if ibmac_com:
            target_policy_input_n = [
                target_hiddens_n[i] + target_message_n[i]
                for i in range(num_agents)
            ]
        else:
            target_policy_input_n = target_hiddens_n
        target_p_n = [
            after_com_func(target_policy_input_n[i],
                           int(act_pdtype_n[i].param_shape()[0]),
                           scope="target_p_func_{}".format(i),
                           num_units=num_units)
            for i in range(num_agents)
        ]
        target_p_func_vars = [
            U.scope_vars(
                U.absolute_scope_name("target_p_func_{}".format(i)))
            for i in range(num_agents)
        ]

        target_var_groups = list(target_before_com_vars)
        if ibmac_com:
            target_var_groups.extend(target_channel_vars)
        target_var_groups.extend(target_p_func_vars)
        target_var_list = list(
            itertools.chain.from_iterable(target_var_groups))
        update_target_p = make_update_exp(var_list, target_var_list)

        target_act_sample_n = [
            act_pdtype_n[i].pdfromflat(target_p_n[i]).sample()
            for i in range(num_agents)
        ]
        target_act = obs_function(target_act_sample_n)

        check_message_n = obs_function(message_n)
        check_hiddens_n = obs_function(hiddens_n)
        check_entropy = obs_function(entropy)

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act,
            'kl_loss': kl_loss,
            'check_values': check_values,
            'channel_com': channel_com,
            'check_mu': check_mu,
            'check_log': check_log,
            'check_message_n': check_message_n,
            'check_hiddens_n': check_hiddens_n,
            'check_entropy': check_entropy,
        }