# TF1-style graph code: requires TensorFlow 1.x (for tf.placeholder and
# tf.contrib) and DeepMind's TRFL library. `lambda_` is assumed to be a
# hyperparameter defined elsewhere in the module.
import tensorflow as tf
import trfl


def __init__(self, name, learning_rate=0.001):
    with tf.variable_scope(name, 'connection'):
        self.name = name
        # inputs: per-step individual and social signals ([T, 1]), rewards
        # and discounts ([T, 1]), and a bootstrap value estimate ([1])
        self.ind_inp = tf.placeholder(tf.float32, [None, 1], name='ind_inp')
        self.soc_inp = tf.placeholder(tf.float32, [None, 1], name='soc_inp')
        self.reward_ = tf.placeholder(tf.float32, [None, 1], name='reward')
        self.discount_ = tf.placeholder(tf.float32, [None, 1], name='discount')
        self.bootstrap_ = tf.placeholder(tf.float32, [None], name='bootstrap')
        # combine the two scalar inputs into one 2-feature vector per step
        # and map it to a single baseline value with a linear layer
        self.concat = tf.concat([self.ind_inp, self.soc_inp], 1)
        self.con_layer = tf.contrib.layers.fully_connected(
            self.concat, 1, activation_fn=None)
        # lambda-return critic loss on the combined baseline
        self.con_return, self.advantage = trfl.sequence_advantage_critic_loss(
            self.con_layer, self.reward_, self.discount_, self.bootstrap_,
            lambda_=lambda_, baseline_cost=1)
        self.critic_loss = tf.reduce_mean(self.con_return.loss)
        self.critic_optim = tf.train.AdamOptimizer(
            learning_rate=learning_rate).minimize(self.critic_loss)
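# --- Usage sketch (illustrative; not part of the original source). Assumes
# the __init__ above is the constructor of a class here called
# `ConnectionCritic` (hypothetical name) and that `lambda_` is defined.
# Feeds one random length-T trajectory, batch size 1, and takes a single
# optimizer step.
import numpy as np

T = 8  # trajectory length; the [None, 1] shapes fix the batch size at 1
connection = ConnectionCritic(name='connection_critic')
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    loss, _ = sess.run(
        [connection.critic_loss, connection.critic_optim],
        feed_dict={
            connection.ind_inp: np.random.rand(T, 1),
            connection.soc_inp: np.random.rand(T, 1),
            connection.reward_: np.random.rand(T, 1),
            connection.discount_: np.full((T, 1), 0.99),
            connection.bootstrap_: np.zeros(1),  # value estimate at cutoff
        })
    print('connection critic loss:', loss)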
def __init__(self, name, socialism_learning_rate=0.01):
    with tf.variable_scope(name):
        self.name = name
        self.individual_input_ = tf.placeholder(tf.float32, [None, 1], name='i_input')
        self.social_input_ = tf.placeholder(tf.float32, [None, 1], name='s_input')
        self.reward_ = tf.placeholder(tf.float32, [None, 1], name='reward')
        self.discount_ = tf.placeholder(tf.float32, [None, 1], name='discount')
        self.bootstrap_ = tf.placeholder(tf.float32, [None], name='bootstrap')
        # pair the two inputs per step with concat; the original
        # tf.reshape([...], [-1, 2]) referenced the attributes without their
        # trailing underscores (an AttributeError) and would have paired
        # consecutive flattened elements rather than step t of each input
        # (see the numpy check below)
        self.combined_input_ = tf.concat(
            [self.individual_input_, self.social_input_], axis=1)
        self.combined_baseline_ = tf.contrib.layers.fully_connected(
            self.combined_input_, 1, activation_fn=None)
        # `lambda_` and `baseline_cost` are hyperparameters defined elsewhere
        self.combined_return, _ = trfl.sequence_advantage_critic_loss(
            self.combined_baseline_, self.reward_, self.discount_,
            self.bootstrap_, lambda_=lambda_, baseline_cost=baseline_cost)
        self.combined_loss_ = tf.reduce_mean(self.combined_return.loss)
        self.combined_optim = tf.train.AdamOptimizer(
            learning_rate=socialism_learning_rate).minimize(self.combined_loss_)
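# --- Why concat replaces the reshape (illustrative numpy check; not part
# of the original source). Stacking two [T, 1] arrays and reshaping to
# [-1, 2] pairs consecutive flattened elements instead of pairing step t
# of the individual and social inputs:
import numpy as np

a = np.array([[1.], [2.], [3.]])     # individual input, shape [3, 1]
b = np.array([[10.], [20.], [30.]])  # social input, shape [3, 1]
print(np.reshape([a, b], (-1, 2)))
# [[ 1.  2.]
#  [ 3. 10.]
#  [20. 30.]]  <- wrong: mixes time steps across rows
print(np.concatenate([a, b], axis=1))
# [[ 1. 10.]
#  [ 2. 20.]
#  [ 3. 30.]]  <- right: one (individual, social) pair per step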
def __init__(self, name, critic_hidden_size=32, critic_learning_rate=0.0001):
    with tf.variable_scope(name, "Critic"):
        # define inputs for critic networks
        self.name = name
        self.input_ = tf.placeholder(tf.float32, [None, obs_size], name='inputs')
        self.reward_ = tf.placeholder(tf.float32, [None, 1], name='reward')
        self.discount_ = tf.placeholder(tf.float32, [None, 1], name='discount')
        self.bootstrap_ = tf.placeholder(tf.float32, [None], name='bootstrap')
        # set up critic network (hidden layers)
        self.fc1_critic_ = tf.contrib.layers.fully_connected(
            self.input_, critic_hidden_size, activation_fn=tf.nn.elu)
        self.fc2_critic_ = tf.contrib.layers.fully_connected(
            self.fc1_critic_, critic_hidden_size, activation_fn=tf.nn.elu)
        self.fc3_critic_ = tf.contrib.layers.fully_connected(
            self.fc2_critic_, critic_hidden_size, activation_fn=tf.nn.elu)
        # set up critic network (output layer)
        self.baseline_ = tf.contrib.layers.fully_connected(
            self.fc3_critic_, 1, activation_fn=None)
        # get critic loss
        self.Critic_return, self.advantage = trfl.sequence_advantage_critic_loss(
            self.baseline_, self.reward_, self.discount_, self.bootstrap_,
            lambda_=lambda_, baseline_cost=baseline_cost)
        # Optimize the loss
        self.critic_loss_ = tf.reduce_mean(self.Critic_return.loss)
        self.critic_optim = tf.train.AdamOptimizer(
            learning_rate=critic_learning_rate).minimize(self.critic_loss_)
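# --- Usage sketch (illustrative; not part of the original source). Assumes
# this __init__ belongs to a class here called `Critic` (hypothetical name,
# matching the variable-scope default) and that `obs_size`, `lambda_`, and
# `baseline_cost` are defined at module level before the graph is built.
import numpy as np

T = 16  # trajectory length; batch size is 1 given the placeholder shapes
critic = Critic(name='critic')
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(100):  # a few gradient steps on random stand-in data
        loss, _ = sess.run(
            [critic.critic_loss_, critic.critic_optim],
            feed_dict={
                critic.input_: np.random.rand(T, obs_size),
                critic.reward_: np.random.rand(T, 1),
                critic.discount_: np.full((T, 1), 0.99),
                critic.bootstrap_: np.zeros(1),
            })
    print('critic loss after 100 steps:', loss)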