def q_train_ddpg(observPlaceHolderList, actionSpaceList, q_index, q_func, optimizer,
                 grad_norm_clipping=None, scope="trainer", reuse=None, num_units=64):
    with tf.variable_scope(scope, reuse=reuse):
        actionPlaceHolderTypeList = [make_pdtype(actionSpace) for actionSpace in actionSpaceList]

        # set up placeholders
        act_ph_n = [actionPlaceHolderTypeList[i].sample_placeholder([None], name="action" + str(i))
                    for i in range(len(actionSpaceList))]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        # q_input = tf.concat(observPlaceHolderList + act_ph_n, 1)
        # specific for DDPG: the critic conditions only on this agent's own observation and action
        q_input = tf.concat([observPlaceHolderList[q_index], act_ph_n[q_index]], 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        q_loss = tf.reduce_mean(tf.square(q - target_ph))
        loss = q_loss  # + 1e-3 * q_reg
        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=observPlaceHolderList + act_ph_n + [target_ph],
                           outputs=loss, updates=[optimize_expr])
        q_values = U.function(observPlaceHolderList + act_ph_n, q)

        # target network
        target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)
        target_q_values = U.function(observPlaceHolderList + act_ph_n, target_q)

        return train, update_target_q, {'q_values': q_values, 'target_q_values': target_q_values}
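# make_update_exp is called throughout this file but not defined here. A minimal sketch of
# what it is assumed to do, following the standard soft (polyak) target-network update used
# in MADDPG-style reference implementations; the rate 1.0 - 1e-2 is an assumption, not taken
# from this file.
def make_update_exp(vals, target_vals, polyak=1.0 - 1e-2):
    # For each (online, target) variable pair, move the target a small step toward the online value.
    expression = []
    for var, var_target in zip(sorted(vals, key=lambda v: v.name),
                               sorted(target_vals, key=lambda v: v.name)):
        expression.append(var_target.assign(polyak * var_target + (1.0 - polyak) * var))
    expression = tf.group(*expression)
    # Returned callable runs the grouped assign ops in the default session.
    return U.function([], [], updates=[expression])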
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # create action distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]  # SoftCategoricalPdType objects

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
                    for i in range(len(act_space_n))]  # 0th element: name = 'agent_0_1/action0:0', shape = (?, 5)

        p_input = obs_ph_n[p_index]
        # e.g. mlp_model(tensor(?, 12), 5, scope="p_func", num_units=64)
        p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="p_func", num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)  # SoftCategoricalPd object
        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        act_input_n = act_ph_n + []
        act_input_n[p_index] = act_pd.sample()
        q_input = tf.concat(obs_ph_n + act_input_n, 1)  # shape = 12+12+10+5*3
        if local_q_func:  # DDPG: uses only the agent's own observation and action
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)  # shape = 17 = 12 + 5
        q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0]
        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target network
        target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="target_p_func", num_units=num_units)
        target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample)

        return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act}
def p_train(observPlaceHolderList, actionSpaceList, agentIndex, getMLPModel, q_func, optimizer,
            grad_norm_clipping=None, ddpg=False, num_units=64, scope="trainer", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # create action distributions
        actionPlaceHolderTypeList = [make_pdtype(actionSpace) for actionSpace in actionSpaceList]

        # set up placeholders
        act_ph_n = [actionPlaceHolderTypeList[i].sample_placeholder([None], name="action" + str(i))
                    for i in range(len(actionSpaceList))]

        p_input = observPlaceHolderList[agentIndex]
        p = getMLPModel(p_input, int(actionPlaceHolderTypeList[agentIndex].param_shape()[0]),
                        scope="getMLPModel", num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("getMLPModel"))

        # wrap parameters in distribution
        act_pd = actionPlaceHolderTypeList[agentIndex].pdfromflat(p)
        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        act_input_n = act_ph_n + []
        act_input_n[agentIndex] = act_pd.sample()
        q_input = tf.concat(observPlaceHolderList + act_input_n, 1)
        if ddpg:
            q_input = tf.concat([observPlaceHolderList[agentIndex], act_input_n[agentIndex]], 1)
        q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0]
        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=observPlaceHolderList + act_ph_n, outputs=loss, updates=[optimize_expr])
        act = U.function(inputs=[observPlaceHolderList[agentIndex]], outputs=act_sample)
        p_values = U.function([observPlaceHolderList[agentIndex]], p)

        # target network
        target_p = getMLPModel(p_input, int(actionPlaceHolderTypeList[agentIndex].param_shape()[0]),
                               scope="target_p_func", num_units=num_units)
        target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = actionPlaceHolderTypeList[agentIndex].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[observPlaceHolderList[agentIndex]], outputs=target_act_sample)

        return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act}
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64):
    # num_units: number of units in the MLP (command-line default 64)
    # local_q_func: False for MADDPG, True for DDPG
    with tf.variable_scope(scope, reuse=reuse):
        # create action distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
                    for i in range(len(act_space_n))]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # viscosity solution to Bellman differential equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss  # + 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr])
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # target network
        target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)
        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        return train, update_target_q, {'q_values': q_values, 'target_q_values': target_q_values}
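# The target_ph fed into q_train's train function is computed outside this graph. A minimal
# sketch of that computation, assuming the usual MADDPG update over a replay batch
# (obs_n, act_n, rew, obs_next_n, done) and the target_act / target_q_values callables
# returned by p_train and q_train; the helper name, gamma, and variable names are
# illustrative assumptions, not part of this file.
def compute_q_target(rew, done, obs_next_n, target_act_fns, target_q_values, gamma=0.95):
    # each agent's target policy acts on its own next observation
    target_act_next_n = [target_act_fns[i](obs_next_n[i]) for i in range(len(obs_next_n))]
    # the (centralized) target critic evaluates the joint next observation-action
    target_q_next = target_q_values(*(obs_next_n + target_act_next_n))
    # standard one-step TD target; terminal transitions do not bootstrap
    return rew + gamma * (1.0 - done) * target_q_next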
def p_train(observPlaceHolderList, actionSpaceList, agentIndex, p_func, q_func, optimizer,
            grad_norm_clipping, ddpg, num_units=64, scope="trainer", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        actionPlaceHolderList = [tf.placeholder(dtype=tf.float32, shape=[None] + [actionSpaceList[i].n],
                                                name="action" + str(i))
                                 for i in range(len(actionSpaceList))]

        policyNetInput = observPlaceHolderList[agentIndex]  # agent's own observation
        policyOutputShape = int(actionSpaceList[agentIndex].n)
        policyTrainOutput = p_func(policyNetInput, policyOutputShape, scope="p_func", num_units=num_units)
        policyNetVariables = U.scope_vars(U.absolute_scope_name("p_func"))

        # Gumbel-softmax sampling of actions; this is the output of the act function
        sampleNoise = tf.random_uniform(tf.shape(policyTrainOutput), seed=0)
        actionSample = U.softmax(policyTrainOutput - tf.log(-tf.log(sampleNoise)), axis=-1)
        p_reg = tf.reduce_mean(tf.square(policyTrainOutput))

        actionInputPlaceHolderList = actionPlaceHolderList + []
        actionInputPlaceHolderList[agentIndex] = actionSample
        qNetInput = tf.concat(observPlaceHolderList + actionInputPlaceHolderList, 1)
        if ddpg:
            qNetInput = tf.concat([observPlaceHolderList[agentIndex], actionSample], 1)
        q = q_func(qNetInput, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0]
        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss + p_reg * 1e-3  # optimization objective unchanged from the reference DDPG version

        optimize_expr = U.minimize_and_clip(optimizer, loss, policyNetVariables, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=observPlaceHolderList + actionPlaceHolderList,
                           outputs=loss, updates=[optimize_expr])
        act = U.function(inputs=[observPlaceHolderList[agentIndex]], outputs=actionSample)
        p_values = U.function([observPlaceHolderList[agentIndex]], policyTrainOutput)

        # target network
        target_p = p_func(policyNetInput, int(actionSpaceList[agentIndex].n),
                          scope="target_p_func", num_units=num_units)
        targetNetVariables = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(policyNetVariables, targetNetVariables)

        uTarget = tf.random_uniform(tf.shape(target_p))
        target_act_sample = U.softmax(target_p - tf.log(-tf.log(uTarget)), axis=-1)
        target_act = U.function(inputs=[observPlaceHolderList[agentIndex]], outputs=target_act_sample)

        return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act}
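# The sampling in the p_train version above is a Gumbel-softmax relaxation: adding
# -log(-log(u)) Gumbel noise to the policy logits and taking a softmax approximates drawing a
# one-hot categorical action while staying differentiable, so the policy gradient can flow
# through q_func. A standalone sketch of the same trick (temperature fixed at 1, as above);
# the helper name is illustrative only.
def gumbel_softmax_sample(logits):
    u = tf.random_uniform(tf.shape(logits))
    gumbel_noise = -tf.log(-tf.log(u))
    # equivalent to softmax(logits - log(-log(u))) used above
    return U.softmax(logits + gumbel_noise, axis=-1)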
def q_train(observPlaceHolderList, actionSpaceList, agentIndex, q_func, optimizer,
            grad_norm_clipping=None, ddpg=False, scope="trainer", reuse=None, num_units=64):
    with tf.variable_scope(scope, reuse=reuse):
        actionPlaceHolderList = [tf.placeholder(dtype=tf.float32, shape=[None] + [actionSpaceList[i].n],
                                                name="action" + str(i))
                                 for i in range(len(actionSpaceList))]
        yi_ = tf.placeholder(tf.float32, [None], name="target")

        q_input = tf.concat(observPlaceHolderList + actionPlaceHolderList, 1)  # shape (?, 24)
        if ddpg:
            q_input = tf.concat([observPlaceHolderList[agentIndex],
                                 actionPlaceHolderList[agentIndex]], 1)  # shape (?, 13)
        q = q_func(q_input, 1, scope="q_func",
                   num_units=num_units)[:, 0]  # drop a level: shape (?, 1) to shape (?,)
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        loss = tf.reduce_mean(tf.square(q - yi_))
        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=observPlaceHolderList + actionPlaceHolderList + [yi_],
                           outputs=loss, updates=[optimize_expr])
        q_values = U.function(observPlaceHolderList + actionPlaceHolderList, q)

        # target network
        target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)
        target_q_values = U.function(observPlaceHolderList + actionPlaceHolderList, target_q)

        return train, update_target_q, {'q_values': q_values, 'target_q_values': target_q_values}
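# A minimal usage sketch of how the returned callables are assumed to be wired together for
# one update step, using the renamed signatures above and, for brevity, a single-agent setting.
# The helpers makeObsPlaceholders and mlp_model, the batch variables (obs_n, act_n, rew,
# obs_next_n, done), and the hyperparameters are illustrative assumptions, not part of this
# file; a TF session is assumed to exist with variables initialized.
observPlaceHolderList = makeObsPlaceholders()  # one observation placeholder per agent
optimizer = tf.train.AdamOptimizer(1e-2)

trainCritic, updateTargetQ, qDebug = q_train(observPlaceHolderList, actionSpaceList, agentIndex=0,
                                             q_func=mlp_model, optimizer=optimizer,
                                             grad_norm_clipping=0.5, ddpg=False)
act, trainPolicy, updateTargetP, pDebug = p_train(observPlaceHolderList, actionSpaceList, agentIndex=0,
                                                  p_func=mlp_model, q_func=mlp_model,
                                                  optimizer=optimizer, grad_norm_clipping=0.5,
                                                  ddpg=False)

# one update on a replay batch, with the TD target computed as sketched after q_train above
target = compute_q_target(rew, done, obs_next_n,
                          target_act_fns=[pDebug['target_act']],
                          target_q_values=qDebug['target_q_values'])
trainCritic(*(obs_n + act_n + [target]))  # critic regression toward the TD target
trainPolicy(*(obs_n + act_n))             # policy ascent on the critic value
updateTargetQ()                           # soft-update both target networks
updateTargetP()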