# Reconstructed imports: only standard-library and third-party modules are listed here.
# The project-local classes referenced below (Agent, ActorNet / ActorNet_bn,
# CriticNet / CriticNet_bn, grad_inverter) are assumed to be importable from the
# surrounding package.
import os
import random
from collections import deque

import numpy as np
from gym.spaces import Box


class DDPGAgent(Agent):
    ''' stevenpjg's implementation of DDPG algorithm '''

    REPLAY_MEMORY_SIZE = 10000
    BATCH_SIZE = 64
    GAMMA = 0.99

    def __init__(self, env, dir, is_batch_norm=False, is_grad_inverter=True, training_flag=True):
        super().__init__(env, dir)

        assert isinstance(env.action_space, Box), "action space must be continuous"

        if is_batch_norm:
            self.critic_net = CriticNet_bn(self.observation_space_size,
                                           self.action_space_size)
            self.actor_net = ActorNet_bn(self.observation_space_size,
                                         self.action_space_size)
        else:
            self.critic_net = CriticNet(self.observation_space_size,
                                        self.action_space_size)
            self.actor_net = ActorNet(self.observation_space_size,
                                      self.action_space_size)

        self.is_grad_inverter = is_grad_inverter
        self.training_flag = training_flag
        self.replay_memory = deque()

        self.time_step = 0

        action_max = np.array(self.high).tolist()
        action_min = np.array(self.low).tolist()
        action_bounds = [action_max, action_min]
        self.grad_inv = grad_inverter(action_bounds)

        self.data_fetch = None

    def add_data_fetch(self, df):
        self.data_fetch = df

    def get_short_name(self):
        return 'DDPG'

    def act(self, state):
        state = self._np_shaping(state, True)
        result = self.actor_net.evaluate_actor(state).astype(float)
        if self.data_fetch:
            self.data_fetch.set_actors_action(result[0].tolist())
        return result

    def observe(self, episode):
        episode['obs'] = self._np_shaping(episode['obs'], True)
        episode['action'] = self._np_shaping(episode['action'], False)
        episode['obs2'] = self._np_shaping(episode['obs2'], True)
        self.add_experience(episode)

    def add_experience(self, episode):
        self.replay_memory.append(episode)
        self.time_step += 1
        if len(self.replay_memory) > type(self).REPLAY_MEMORY_SIZE:
            self.replay_memory.popleft()

        if len(self.replay_memory) > type(self).BATCH_SIZE:
            return self.train()
        else:
            return None

    def minibatches(self):
        batch = random.sample(self.replay_memory, type(self).BATCH_SIZE)
        # state t
        state = self._np_shaping(np.array([item['obs'] for item in batch]), True)
        # action
        action = self._np_shaping(np.array([item['action'] for item in batch]), False)
        # reward
        reward = np.array([item['reward'] for item in batch])
        # state t+1
        state_2 = self._np_shaping(np.array([item['obs2'] for item in batch]), True)
        # done
        done = np.array([item['done'] for item in batch])

        return state, action, reward, state_2, done

    def train(self):
        if not self.training_flag:
            return

        # sample a random minibatch of N transitions from R
        state, action, reward, state_2, done = self.minibatches()

        actual_batch_size = len(state)

        target_action = self.actor_net.evaluate_target_actor(state)

        # Q'(s_i+1, a_i+1)
        q_t = self.critic_net.evaluate_target_critic(state_2, target_action)

        y = []
        for i in range(0, actual_batch_size):
            if done[i]:
                y.append(reward[i])
            else:
                y.append(reward[i] + type(self).GAMMA * q_t[i][0])  # q_t+1 instead of q_t
        y = np.reshape(np.array(y), [len(y), 1])

        # Update critic by minimizing the loss
        self.critic_net.train_critic(state, action, y)

        # Update actor proportional to the gradients:
        # action_for_delQ = self.act(state)  # was self.evaluate_actor instead of self.act
        action_for_delQ = self.actor_net.evaluate_actor(state)  # don't need wolp action

        if self.is_grad_inverter:
            del_Q_a = self.critic_net.compute_delQ_a(state, action_for_delQ)  # /BATCH_SIZE
            del_Q_a = self.grad_inv.invert(del_Q_a, action_for_delQ)
        else:
            del_Q_a = self.critic_net.compute_delQ_a(state, action_for_delQ)[0]  # /BATCH_SIZE

        # train actor network proportional to delQ/dela and del_Actor_model/del_actor_parameters:
        self.actor_net.train_actor(state, del_Q_a)

        # Update target Critic and actor network
        self.critic_net.update_target_critic()
        self.actor_net.update_target_actor()

    def save_agent(self, force=False, comment="default"):
        path = "{}/weights/{}".format(self.get_dir(), comment)
        if not os.path.exists(path):
            os.makedirs(path, exist_ok=True)
            print("Saving agent in", path)
            self.actor_net.save_model(path + '/actor.ckpt')
            self.critic_net.save_model(path + '/critic.ckpt')
        elif force:
            print("Overwrite old agent in", path)
            self.actor_net.save_model(path + '/actor.ckpt')
            self.critic_net.save_model(path + '/critic.ckpt')
        else:
            print("Save aborted. An agent is already saved in", path)

    def load_agent(self, agent_name=None, comment="default"):
        if agent_name is None:
            path = "{}/weights/{}".format(self.get_dir(), comment)
        else:
            path = "{}/{}/{}/weights/{}".format(self.result_dir, agent_name,
                                                self.env.spec.id, comment)

        if os.path.exists(path):
            print("Loading agent saved in", path)
            self.actor_net.load_model(path + '/actor.ckpt')
            self.critic_net.load_model(path + '/critic.ckpt')
        else:
            print("Agent not found in", path)

    def close_session(self):
        self.actor_net.close()
        self.critic_net.close()
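# --- Usage sketch (not part of the original source) ---------------------------------
# A minimal driver loop for DDPGAgent, assuming the classic gym step API
# (obs, reward, done, info) and a continuous-control environment constructed by the
# caller (e.g. agent = DDPGAgent(env, dir='results'); the directory name is
# hypothetical). It only exercises the public methods defined above: act(),
# observe(), save_agent() and close_session().

def run_ddpg_agent_example(env, agent, episodes=100):
    """Illustrative sketch; env and agent are built by the caller."""
    for _ in range(episodes):
        obs = env.reset()
        done = False
        while not done:
            # act() expects a shaped state and returns a batch of actions.
            action = agent.act(np.array(obs))[0]
            obs2, reward, done, _ = env.step(action)
            # observe() shapes and stores the transition; training starts once the
            # replay memory holds more than BATCH_SIZE samples.
            agent.observe({'obs': obs, 'action': action, 'obs2': obs2,
                           'reward': reward, 'done': done})
            obs = obs2
    agent.save_agent(force=True)
    agent.close_session()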
# Module-level hyperparameters referenced by the DDPG class below. Their defining
# file is not shown here; the values are assumed to mirror the class constants of
# DDPGAgent above, and is_grad_inverter is assumed to default to True.
REPLAY_MEMORY_SIZE = 10000
BATCH_SIZE = 64
GAMMA = 0.99
is_grad_inverter = True


class DDPG:
    """ Deep Deterministic Policy Gradient Algorithm """

    def __init__(self, env, is_batch_norm=False):
        self.env = env
        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]

        if is_batch_norm:
            self.critic_net = CriticNet_bn(self.num_states, self.num_actions)
            self.actor_net = ActorNet_bn(self.num_states, self.num_actions)
        else:
            self.critic_net = CriticNet(self.num_states, self.num_actions)
            self.actor_net = ActorNet(self.num_states, self.num_actions)

        # Initialize replay buffer:
        self.replay_memory = deque()

        # Initialize time step:
        self.time_step = 0
        self.counter = 0

        action_max = np.array(env.action_space.high).tolist()
        action_min = np.array(env.action_space.low).tolist()
        action_bounds = [action_max, action_min]
        self.grad_inv = grad_inverter(action_bounds)

    def load_model(self):
        try:
            self.critic_net.load_model()
            self.actor_net.load_model()
        except Exception as ex:
            print(ex)

    def save_model(self):
        self.critic_net.save_model()
        self.actor_net.save_model()

    def evaluate_actor(self, state_t):
        return self.actor_net.evaluate_actor(state_t)

    def add_experience(self, observation_1, observation_2, action, reward, done):
        self.observation_1 = observation_1
        self.observation_2 = observation_2
        self.action = action
        self.reward = reward
        self.done = done
        self.replay_memory.append((self.observation_1, self.observation_2,
                                   self.action, self.reward, self.done))
        self.time_step = self.time_step + 1
        if len(self.replay_memory) > REPLAY_MEMORY_SIZE:
            self.replay_memory.popleft()

    def minibatches(self):
        batch = random.sample(self.replay_memory, BATCH_SIZE)
        # state t
        self.state_t_batch = np.array([item[0] for item in batch])
        # state t+1
        self.state_t_1_batch = np.array([item[1] for item in batch])
        # action
        self.action_batch = np.array([item[2] for item in batch])
        self.action_batch = np.reshape(self.action_batch,
                                       [len(self.action_batch), self.num_actions])
        # reward
        self.reward_batch = np.array([item[3] for item in batch])
        # done
        self.done_batch = np.array([item[4] for item in batch])

    def train(self):
        # sample a random minibatch of N transitions from R
        self.minibatches()
        self.action_t_1_batch = self.actor_net.evaluate_target_actor(self.state_t_1_batch)
        # Q'(s_i+1, a_i+1)
        q_t_1 = self.critic_net.evaluate_target_critic(self.state_t_1_batch,
                                                       self.action_t_1_batch)
        self.y_i_batch = []
        for i in range(0, BATCH_SIZE):
            if self.done_batch[i]:
                self.y_i_batch.append(self.reward_batch[i])
            else:
                self.y_i_batch.append(self.reward_batch[i] + GAMMA * q_t_1[i][0])

        self.y_i_batch = np.array(self.y_i_batch)
        self.y_i_batch = np.reshape(self.y_i_batch, [len(self.y_i_batch), 1])

        # Update critic by minimizing the loss
        self.critic_net.train_critic(self.state_t_batch, self.action_batch, self.y_i_batch)

        # Update actor proportional to the gradients:
        action_for_delQ = self.evaluate_actor(self.state_t_batch)

        if is_grad_inverter:
            self.del_Q_a = self.critic_net.compute_delQ_a(self.state_t_batch,
                                                          action_for_delQ)  # /BATCH_SIZE
            self.del_Q_a = self.grad_inv.invert(self.del_Q_a, action_for_delQ)
        else:
            self.del_Q_a = self.critic_net.compute_delQ_a(self.state_t_batch,
                                                          action_for_delQ)[0]  # /BATCH_SIZE

        # train actor network proportional to delQ/dela and del_Actor_model/del_actor_parameters:
        self.actor_net.train_actor(self.state_t_batch, self.del_Q_a)

        # Update target Critic and actor network
        self.critic_net.update_target_critic()
        self.actor_net.update_target_actor()
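# --- Usage sketch (not part of the original source) ---------------------------------
# The lower-level DDPG class leaves the interaction loop and exploration to the
# caller. The sketch below assumes the classic gym step API and uses simple Gaussian
# exploration noise as a stand-in for the Ornstein-Uhlenbeck process usually paired
# with DDPG; noise_scale is an arbitrary illustrative value.

def run_ddpg_example(env, episodes=100, noise_scale=0.1):
    """Illustrative driver loop for the DDPG class."""
    agent = DDPG(env)
    for _ in range(episodes):
        obs = env.reset()
        done = False
        while not done:
            # evaluate_actor() expects a batch of states with shape [1, num_states].
            state = np.reshape(obs, [1, agent.num_states])
            action = agent.evaluate_actor(state)[0] \
                + noise_scale * np.random.randn(agent.num_actions)
            obs2, reward, done, _ = env.step(action)
            agent.add_experience(obs, obs2, action, reward, done)
            # train() samples BATCH_SIZE transitions, so wait until enough are stored.
            if len(agent.replay_memory) > BATCH_SIZE:
                agent.train()
            obs = obs2
    agent.save_model()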