class DDPG:
    """DDPG agent that is stepped by an external environment loop.

    Usage, as far as this class shows: call `set_init_observation` with the
    first observation, then alternate `get_action` and `set_feedback`.
    `set_feedback` stores the transition and triggers `train` once enough
    steps have been taken.

    Relies on names that are not defined in this class: `ActorNetwork`,
    `CriticNetwork`, `OUNoise`, `BATCH_SIZE`, `GAMMA`, `REPLAY_BUFFER_SIZE`
    and `REPLAY_START_SIZE`.

    NOTE(review): a second `class DDPG` definition appears later in this
    file and will shadow this one at import time -- confirm which is intended.
    """

    def __init__(self, environment):
        """Build the networks, replay buffer and exploration noise.

        Args:
            environment: object exposing `observation_space.shape` and
                `action_space.shape` / `.low` / `.high`.
        """
        self.name = 'DDPG'  # name for uploading results
        self.environment = environment
        state_size = environment.observation_space.shape[0]
        action_size = environment.action_space.shape[0]
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.actor_network = ActorNetwork(
            state_size=state_size, action_size=action_size)
        self.critic_network = CriticNetwork(
            state_size=state_size, action_size=action_size)

        # initialize replay buffer
        self.replay_buffer = deque()

        # Initialize a random process (the Ornstein-Uhlenbeck process)
        # for action exploration
        self.exploration_noise = OUNoise(action_size)

        # Initialize time step
        self.time_step = 0

    def set_init_observation(self, observation):
        """Record the initial observation as the current state."""
        self.state = observation

    @staticmethod
    def _stack_actions(action_batch):
        """Stack sampled actions into a (batch, action_dim) array."""
        # The previous np.resize(..., [BATCH_SIZE, 1]) silently truncated
        # every action to its first component(s) when action_dim > 1.
        return np.asarray(action_batch).reshape(len(action_batch), -1)

    @staticmethod
    def _compute_targets(reward_batch, done_batch, q_value_batch, gamma):
        """Return critic targets: r if done, else r + gamma * Q_target.

        The result is an ndarray with the same shape as `q_value_batch`, so
        terminal and non-terminal entries no longer differ in shape.
        """
        q_values = np.asarray(q_value_batch)
        # Keep the critic's float dtype when it has one.
        if np.issubdtype(q_values.dtype, np.floating):
            dtype = q_values.dtype
        else:
            dtype = np.float64
        # Give the per-sample vectors trailing singleton axes so that they
        # broadcast against q_values whether it is (batch,) or (batch, 1).
        column = (-1,) + (1,) * (q_values.ndim - 1)
        rewards = np.asarray(reward_batch, dtype=dtype).reshape(column)
        dones = np.asarray(done_batch, dtype=bool).reshape(column)
        return np.where(dones, rewards, rewards + gamma * q_values)

    def train(self):
        """Run one update of the critic, the actor and both target networks.

        Raises:
            ValueError: from `random.sample` if the replay buffer holds
                fewer than `BATCH_SIZE` transitions.
        """
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = random.sample(self.replay_buffer, BATCH_SIZE)
        state_batch = [data[0] for data in minibatch]
        action_batch = self._stack_actions([data[1] for data in minibatch])
        reward_batch = [data[2] for data in minibatch]
        next_state_batch = [data[3] for data in minibatch]
        done_batch = [data[4] for data in minibatch]

        # Calculate y
        next_action_batch = self.actor_network.target_evaluate(
            next_state_batch)
        q_value_batch = self.critic_network.target_evaluate(
            next_state_batch, next_action_batch)
        y_batch = self._compute_targets(
            reward_batch, done_batch, q_value_batch, GAMMA)

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.evaluate(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients) / BATCH_SIZE
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def get_action(self):
        """Return the actor's action for the current state plus exploration
        noise, clipped to the action-space bounds.

        Requires `set_init_observation` to have been called first, because
        `self.state` is not set in `__init__`.
        """
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.get_action(self.state)
        return np.clip(action + self.exploration_noise.noise(),
                       self.environment.action_space.low,
                       self.environment.action_space.high)

    def set_feedback(self, observation, action, reward, done):
        """Store one transition and advance the agent by one time step.

        Args:
            observation: observation received after taking `action`.
            action: action that was executed from the current state.
            reward: reward received for that action.
            done: truthy when the episode ended with this transition.
        """
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        next_state = observation
        self.replay_buffer.append(
            (self.state, action, reward, next_state, done))

        # Update current state
        self.state = next_state
        # Update time step
        self.time_step += 1

        # Limit the replay buffer size
        if len(self.replay_buffer) > REPLAY_BUFFER_SIZE:
            self.replay_buffer.popleft()

        # Store transitions to replay start size then start training
        if self.time_step > REPLAY_START_SIZE:
            self.train()

        # Checkpoint both networks every 10000 steps
        if self.time_step % 10000 == 0:
            self.actor_network.save_network(self.time_step)
            self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
class DDPG:
    """DDPG agent that is stepped by an external environment loop.

    Usage, as far as this class shows: call `set_init_observation` with the
    first observation, then alternate `get_action` and `set_feedback`.
    `set_feedback` stores the transition and triggers `train` once enough
    steps have been taken.

    Relies on names that are not defined in this class: `ActorNetwork`,
    `CriticNetwork`, `OUNoise`, `BATCH_SIZE`, `GAMMA`, `REPLAY_BUFFER_SIZE`
    and `REPLAY_START_SIZE`.

    NOTE(review): an earlier `class DDPG` definition appears above in this
    file; this later one is what the name refers to after import -- confirm
    the duplicate is intended.
    """

    def __init__(self, environment):
        """Build the networks, replay buffer and exploration noise.

        Args:
            environment: object exposing `observation_space.shape` and
                `action_space.shape` / `.low` / `.high`.
        """
        self.name = 'DDPG'  # name for uploading results
        self.environment = environment
        state_size = environment.observation_space.shape[0]
        action_size = environment.action_space.shape[0]
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.actor_network = ActorNetwork(
            state_size=state_size, action_size=action_size)
        self.critic_network = CriticNetwork(
            state_size=state_size, action_size=action_size)

        # initialize replay buffer
        self.replay_buffer = deque()

        # Initialize a random process (the Ornstein-Uhlenbeck process)
        # for action exploration
        self.exploration_noise = OUNoise(action_size)

        # Initialize time step
        self.time_step = 0

    def set_init_observation(self, observation):
        """Record the initial observation as the current state."""
        self.state = observation

    @staticmethod
    def _stack_actions(action_batch):
        """Stack sampled actions into a (batch, action_dim) array."""
        # A fixed [BATCH_SIZE, 1] np.resize would drop every action
        # component beyond the first batch-size values when action_dim > 1.
        return np.asarray(action_batch).reshape(len(action_batch), -1)

    @staticmethod
    def _compute_targets(reward_batch, done_batch, q_value_batch, gamma):
        """Return critic targets: r if done, else r + gamma * Q_target.

        The result is an ndarray with the same shape as `q_value_batch`, so
        terminal and non-terminal entries always share one shape.
        """
        q_values = np.asarray(q_value_batch)
        # Keep the critic's float dtype when it has one.
        if np.issubdtype(q_values.dtype, np.floating):
            dtype = q_values.dtype
        else:
            dtype = np.float64
        # Give the per-sample vectors trailing singleton axes so that they
        # broadcast against q_values whether it is (batch,) or (batch, 1).
        column = (-1,) + (1,) * (q_values.ndim - 1)
        rewards = np.asarray(reward_batch, dtype=dtype).reshape(column)
        dones = np.asarray(done_batch, dtype=bool).reshape(column)
        return np.where(dones, rewards, rewards + gamma * q_values)

    def train(self):
        """Run one update of the critic, the actor and both target networks.

        Raises:
            ValueError: from `random.sample` if the replay buffer holds
                fewer than `BATCH_SIZE` transitions.
        """
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = random.sample(self.replay_buffer, BATCH_SIZE)
        state_batch = [data[0] for data in minibatch]
        action_batch = self._stack_actions([data[1] for data in minibatch])
        reward_batch = [data[2] for data in minibatch]
        next_state_batch = [data[3] for data in minibatch]
        done_batch = [data[4] for data in minibatch]

        # Calculate y
        next_action_batch = self.actor_network.target_evaluate(
            next_state_batch)
        q_value_batch = self.critic_network.target_evaluate(
            next_state_batch, next_action_batch)
        y_batch = self._compute_targets(
            reward_batch, done_batch, q_value_batch, GAMMA)

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.evaluate(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients) / BATCH_SIZE
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def get_action(self):
        """Return the actor's action for the current state plus exploration
        noise, clipped to the action-space bounds.

        Requires `set_init_observation` to have been called first, because
        `self.state` is not set in `__init__`.
        """
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.get_action(self.state)
        return np.clip(action + self.exploration_noise.noise(),
                       self.environment.action_space.low,
                       self.environment.action_space.high)

    def set_feedback(self, observation, action, reward, done):
        """Store one transition and advance the agent by one time step.

        Args:
            observation: observation received after taking `action`.
            action: action that was executed from the current state.
            reward: reward received for that action.
            done: truthy when the episode ended with this transition.
        """
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        next_state = observation
        self.replay_buffer.append(
            (self.state, action, reward, next_state, done))

        # Update current state
        self.state = next_state
        # Update time step
        self.time_step += 1

        # Limit the replay buffer size
        if len(self.replay_buffer) > REPLAY_BUFFER_SIZE:
            self.replay_buffer.popleft()

        # Store transitions to replay start size then start training
        if self.time_step > REPLAY_START_SIZE:
            self.train()

        # Checkpoint both networks every 10000 steps
        if self.time_step % 10000 == 0:
            self.actor_network.save_network(self.time_step)
            self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
class ActorCriticNet:
    """Actor and critic sub-networks wired to shared input placeholders.

    The critic is constructed on top of the actor's output layer, and the
    loss, gradient and summary ops for both are created in `_build`.
    """

    def __init__(self, input_dim, action_dim, critic_layers, actor_layers,
                 actor_activation, scope='ac_network'):
        """Create placeholders, both sub-networks and the training ops.

        Args:
            input_dim: width of the state placeholder `x`.
            action_dim: forwarded to `ActorNetwork`.
            critic_layers: forwarded to `CriticNetwork` as `hidden_layers`.
            actor_layers: forwarded to `ActorNetwork` as `hidden_layers`.
            actor_activation: forwarded to `ActorNetwork` as `activation`.
            scope: variable scope under which both sub-networks are built.
        """
        self.scope = scope
        self.action_dim = action_dim
        self.input_dim = input_dim

        # Placeholders are created before (outside) the variable scope.
        self.x = tf.placeholder(
            shape=(None, input_dim), dtype=tf.float32, name='x')
        self.y = tf.placeholder(shape=(None, ), dtype=tf.float32, name='y')

        with tf.variable_scope(scope):
            actor = ActorNetwork(self.x,
                                 action_dim,
                                 hidden_layers=actor_layers,
                                 activation=actor_activation)
            self.actor_network = actor
            self.critic_network = CriticNetwork(
                self.x, actor.get_output_layer(), hidden_layers=critic_layers)

            # Trainable variables registered under the current scope name.
            scope_name = tf.get_variable_scope().name
            self.vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope_name)
            self._build()

    def _build(self):
        """Create loss, gradient and summary ops for the actor and critic."""
        critic_out = self.critic_network.get_output_layer()

        # Actor: loss is the negated mean of the critic output.
        policy_loss = -tf.reduce_mean(critic_out)
        self.actor_vars = self.actor_network.get_params()
        self.actor_grad = tf.gradients(policy_loss, self.actor_vars)
        tf.summary.scalar("actor_loss", policy_loss, collections=['actor'])
        self.actor_summary = tf.summary.merge_all('actor')

        # Critic: half the mean squared difference to the target `y`.
        # NOTE(review): `self.y` is rank-1; if the critic output is
        # (batch, 1) this subtraction broadcasts to (batch, batch) --
        # confirm the critic output shape.
        residual = critic_out - self.y
        value_loss = 0.5 * tf.reduce_mean(tf.square(residual))
        self.critic_vars = self.critic_network.get_params()
        self.critic_grad = tf.gradients(value_loss, self.critic_vars)
        tf.summary.scalar("critic_loss", value_loss, collections=['critic'])
        self.critic_summary = tf.summary.merge_all('critic')

    def get_action(self, sess, state):
        """Delegate to the actor sub-network's `get_action`."""
        return self.actor_network.get_action(sess, state)

    def get_value(self, sess, state):
        """Delegate to the critic sub-network's `get_value`."""
        return self.critic_network.get_value(sess, state)

    def get_action_value(self, sess, state, action):
        """Delegate to the critic sub-network's `get_action_value`."""
        return self.critic_network.get_action_value(sess, state, action)

    def get_actor_feed_dict(self, state):
        """Return the feed dict mapping the state placeholder to `state`."""
        return {self.x: state}

    def get_critic_feed_dict(self, state, action, target):
        """Return the feed dict for state, target and the critic's action input."""
        feed = {self.x: state, self.y: target}
        feed[self.critic_network.input_action] = action
        return feed

    def get_clone_op(self, network, tau=0.9):
        """Return assign ops that blend `network`'s variables into this one.

        Each variable `v` of this network is assigned
        `(1 - tau) * v + tau * w`, where `w` is the variable of `network`
        whose name matches once each side's scope string is removed.

        Raises:
            KeyError: if a variable here has no counterpart in `network`.
        """
        # Index the source variables by their scope-stripped name.
        source_by_suffix = {}
        for src in network.vars:
            source_by_suffix[src.name.replace(network.scope, '')] = src

        return [
            tf.assign(
                dst,
                (1 - tau) * dst
                + tau * source_by_suffix[dst.name.replace(self.scope, '')])
            for dst in self.vars
        ]