class DDPG:
    """ Deep Deterministic Policy Gradient Algorithm.
    Sourced By: https://github.com/stevenpjg/ddpg-aigym/blob/master/ddpg.py
    """

    def __init__(self, num_states, num_actions, action_space_high,
                 action_space_low, is_batch_norm):
        self.num_states = num_states
        self.num_actions = num_actions
        self.action_space_high = action_space_high
        self.action_space_low = action_space_low

        # Batch normalisation disabled.
        self.critic_net = CriticNet(self.num_states, self.num_actions)
        self.actor_net = ActorNet(self.num_states, self.num_actions)

        # Initialise the replay memory.
        self.replay_memory = deque()

        # Initialise the time step.
        self.time_step = 0
        self.counter = 0

        action_max = np.array(action_space_high).tolist()
        action_min = np.array(action_space_low).tolist()
        action_bounds = [action_max, action_min]
        self.grad_inv = grad_inverter(action_bounds)

    def evaluate_actor(self, state_t):
        return self.actor_net.evaluate_actor(state_t)

    # observation_1 = state at time t
    # observation_2 = state at time (t + 1)
    def add_experience(self, observation_1, observation_2, action, reward,
                       done):
        self.observation_1 = observation_1
        self.observation_2 = observation_2
        self.action = action
        self.reward = reward
        self.done = done
        self.replay_memory.append((self.observation_1, self.observation_2,
                                   self.action, self.reward, self.done))
        self.time_step = self.time_step + 1
        # When the replay memory is full, discard the oldest entry.
        if len(self.replay_memory) > REPLAY_MEMORY_SIZE:
            self.replay_memory.popleft()

    def minibatches(self):
        # Draw BATCH_SIZE transitions from the replay memory.
        batch = random.sample(self.replay_memory, BATCH_SIZE)
        # Store the batches for S(t), S(t + 1), action, reward and done
        # separately.
        self.state_t_batch = [item[0] for item in batch]
        self.state_t_batch = np.array(self.state_t_batch)
        self.state_t_1_batch = [item[1] for item in batch]
        self.state_t_1_batch = np.array(self.state_t_1_batch)
        self.action_batch = [item[2] for item in batch]
        self.action_batch = np.array(self.action_batch)
        self.action_batch = np.reshape(
            self.action_batch, [len(self.action_batch), self.num_actions])
        self.reward_batch = [item[3] for item in batch]
        self.reward_batch = np.array(self.reward_batch)
        self.done_batch = [item[4] for item in batch]
        self.done_batch = np.array(self.done_batch)

    def train(self):
        print("######## Starting to train...")
        # Sample a minibatch.
        self.minibatches()
        # Use the S(t + 1) batch to produce the action batch at time (t + 1).
        self.action_t_1_batch = self.actor_net.evaluate_target_actor(
            self.state_t_1_batch)
        # Q'(S(t + 1), a(t + 1))
        q_t_1 = self.critic_net.evaluate_target_critic(self.state_t_1_batch,
                                                       self.action_t_1_batch)
        print("#### Evaluated critic value (Q value)")
        print(q_t_1)

        self.y_i_batch = []  # holds the processed reward targets
        for i in range(0, BATCH_SIZE):
            # If done == True the transition reached a terminal state, so the
            # reward alone is the target.
            if self.done_batch[i]:
                self.y_i_batch.append(self.reward_batch[i])
            # Otherwise the state is non-terminal, so add (gamma * Q value)
            # to the reward.
            else:
                self.y_i_batch.append(self.reward_batch[i] +
                                      GAMMA * q_t_1[i][0])

        self.y_i_batch = np.array(self.y_i_batch)
        self.y_i_batch = np.reshape(self.y_i_batch, [len(self.y_i_batch), 1])

        # Update the critic network by minimising the loss: the weights are
        # updated so that (y_i_batch - the y value predicted from
        # (state_t_batch, action_batch)) is minimised.
        self.critic_net.train_critic(self.state_t_batch, self.action_batch,
                                     self.y_i_batch)

        # Update the actor along the gradient.
        action_for_delQ = self.evaluate_actor(self.state_t_batch)

        if is_grad_inverter:
            self.del_Q_a = self.critic_net.compute_delQ_a(
                self.state_t_batch, action_for_delQ)
            self.del_Q_a = self.grad_inv.invert(self.del_Q_a, action_for_delQ)
        else:
            self.del_Q_a = self.critic_net.compute_delQ_a(
                self.state_t_batch, action_for_delQ)[0]

        # Train the actor network.
        self.actor_net.train_actor(self.state_t_batch, self.del_Q_a)

        # Update the target critic and target actor networks.
        self.critic_net.update_target_critic()
        self.actor_net.update_target_actor()

        self.critic_net.save_critic("model/critic_model.ckpt")
        self.actor_net.save_actor("model/actor_model.ckpt")
        print("######## Finished training ...")
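# The snippet above (and the stevenpjg-style variants further below) relies on
# names that live at module level in the source repository rather than inside
# the class: REPLAY_MEMORY_SIZE, BATCH_SIZE, GAMMA and the is_grad_inverter
# flag, plus the standard imports. A minimal sketch of that surrounding
# context, assuming the usual hyperparameter values from stevenpjg/ddpg-aigym;
# treat the exact numbers as assumptions, not part of this file:
from collections import deque
import random

import numpy as np

REPLAY_MEMORY_SIZE = 10000   # capacity of the replay deque
BATCH_SIZE = 64              # minibatch size drawn in minibatches()
GAMMA = 0.99                 # discount factor used in the Bellman target
is_grad_inverter = True      # module-level flag checked inside train()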
class DDPGAgent(Agent):
    ''' stevenpjg's implementation of DDPG algorithm '''

    REPLAY_MEMORY_SIZE = 10000
    BATCH_SIZE = 64
    GAMMA = 0.99

    def __init__(self, env, is_batch_norm=False, is_grad_inverter=True):
        super().__init__(env)
        assert isinstance(env.action_space, Box), \
            "action space must be continuous"

        if is_batch_norm:
            self.critic_net = CriticNet_bn(self.observation_space_size,
                                           self.action_space_size)
            self.actor_net = ActorNet_bn(self.observation_space_size,
                                         self.action_space_size)
        else:
            self.critic_net = CriticNet(self.observation_space_size,
                                        self.action_space_size)
            self.actor_net = ActorNet(self.observation_space_size,
                                      self.action_space_size)

        self.is_grad_inverter = is_grad_inverter
        self.replay_memory = deque()
        self.time_step = 0

        action_max = np.array(self.high).tolist()
        action_min = np.array(self.low).tolist()
        action_bounds = [action_max, action_min]
        self.grad_inv = grad_inverter(action_bounds)

    def add_data_fetch(self, df):
        self.data_fetch = df
        self.data_fetch.add_timers(['ev_p_t', 'ev_q_t', 'y', 'train_q',
                                    'train_p', 'up_q_t', 'up_p_t'],
                                   prefix='t_agent_training_')
        self.data_fetch.add_array('actors_result')

    def get_name(self):
        return 'DDPG' + super().get_name()

    def act(self, state):
        state = self._np_shaping(state, True)
        result = self.actor_net.evaluate_actor(state).astype(float)
        self.data_fetch.add_to_array('actors_result', result)
        return result

    def observe(self, episode):
        episode['obs'] = self._np_shaping(episode['obs'], True)
        episode['action'] = self._np_shaping(episode['action'], False)
        episode['obs2'] = self._np_shaping(episode['obs2'], True)
        self.add_experience(episode)

    def add_experience(self, episode):
        self.replay_memory.append(episode)
        self.time_step += 1
        if len(self.replay_memory) > type(self).REPLAY_MEMORY_SIZE:
            self.replay_memory.popleft()

        if len(self.replay_memory) > type(self).BATCH_SIZE:
            res = self.train()
            return res
        else:
            return None

    def minibatches(self):
        batch = random.sample(self.replay_memory, type(self).BATCH_SIZE)
        # state t
        state = self._np_shaping(np.array([item['obs'] for item in batch]),
                                 True)
        # action
        action = self._np_shaping(np.array([item['action'] for item in batch]),
                                  False)
        # reward
        reward = np.array([item['reward'] for item in batch])
        # state t+1
        state_2 = self._np_shaping(np.array([item['obs2'] for item in batch]),
                                   True)
        # done
        done = np.array([item['done'] for item in batch])

        return state, action, reward, state_2, done

    def train(self):
        # Sample a random minibatch of N transitions from R.
        state, action, reward, state_2, done = self.minibatches()
        actual_batch_size = len(state)

        self.data_fetch.reset_timers()

        target_action = self.actor_net.evaluate_target_actor(state)
        self.data_fetch.sample_timer('ev_p_t')  # ------

        # Q'(s_i+1, a_i+1)
        q_t = self.critic_net.evaluate_target_critic(state_2, target_action)
        self.data_fetch.sample_timer('ev_q_t')  # ------

        y = []
        for i in range(0, actual_batch_size):
            if done[i]:
                y.append(reward[i])
            else:
                y.append(reward[i] + type(self).GAMMA * q_t[i][0])  # q_t+1 instead of q_t
        y = np.reshape(np.array(y), [len(y), 1])
        self.data_fetch.sample_timer('y')  # ------

        # Update the critic by minimizing the loss.
        self.critic_net.train_critic(state, action, y)
        self.data_fetch.sample_timer('train_q')  # ------

        # Update the actor proportional to the gradients:
        # action_for_delQ = self.act(state)  # was self.evaluate_actor instead of self.act
        action_for_delQ = self.actor_net.evaluate_actor(state)  # don't need the wolp action

        if self.is_grad_inverter:
            del_Q_a = self.critic_net.compute_delQ_a(
                state, action_for_delQ)  # /BATCH_SIZE
            del_Q_a = self.grad_inv.invert(del_Q_a, action_for_delQ)
        else:
            del_Q_a = self.critic_net.compute_delQ_a(
                state, action_for_delQ)[0]  # /BATCH_SIZE

        # Train the actor network proportional to delQ/dela and
        # del_Actor_model/del_actor_parameters:
        self.actor_net.train_actor(state, del_Q_a)
        self.data_fetch.sample_timer('train_p')  # ------

        # Update the target critic and actor networks.
        self.critic_net.update_target_critic()
        self.data_fetch.sample_timer('up_q_t')  # ------

        self.actor_net.update_target_actor()
        self.data_fetch.sample_timer('up_p_t')  # ------
class DDPG:
    """ Deep Deterministic Policy Gradient Algorithm"""

    def __init__(self, env):
        self.env = env
        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]

        # Initialize the actor and critic networks:
        action_bound = env.action_space.high
        self.critic_net = CriticNet(self.num_states, self.num_actions)
        # self.actor_net is an object
        self.actor_net = ActorNet(self.num_states, self.num_actions,
                                  action_bound)

        # Initialize the replay buffer:
        self.replay_memory = deque()

        # Initialize the time step:
        self.time_step = 0

        # Invert gradients (soft thresholding).
        # Specify the upper bound and lower bound of the action space.
        action_bounds = [[3], [-3]]
        # action_bounds structure for higher-dimensional actions:
        # [
        #   [max_of_action_dim_0, max_of_action_dim_1, ..., max_of_action_dim_10],
        #   [min_of_action_dim_0, min_of_action_dim_1, ..., min_of_action_dim_10]
        # ]
        self.grad_inv = grad_inverter(action_bounds)

    def evaluate_actor(self, state_t):
        return self.actor_net.evaluate_actor(state_t)

    def add_experience(self, observation_1, observation_2, action, reward,
                       done):
        self.observation_1 = observation_1
        self.observation_2 = observation_2
        self.action = action
        self.reward = reward
        self.done = done
        self.replay_memory.append((self.observation_1, self.observation_2,
                                   self.action, self.reward, self.done))
        self.time_step = self.time_step + 1
        if len(self.replay_memory) > REPLAY_MEMORY_SIZE:
            self.replay_memory.popleft()

    def minibatches(self):
        batch = random.sample(self.replay_memory, BATCH_SIZE)
        # state t
        self.state_t_batch = [item[0] for item in batch]
        self.state_t_batch = np.array(self.state_t_batch)
        # state t+1
        self.state_t_1_batch = [item[1] for item in batch]
        self.state_t_1_batch = np.array(self.state_t_1_batch)
        self.action_batch = [item[2] for item in batch]
        self.action_batch = np.array(self.action_batch)
        self.action_batch = np.reshape(self.action_batch,
                                       [len(self.action_batch),
                                        self.num_actions])
        self.reward_batch = [item[3] for item in batch]
        self.reward_batch = np.array(self.reward_batch)
        self.done_batch = [item[4] for item in batch]
        self.done_batch = np.array(self.done_batch)

    def train(self):
        # Sample a random minibatch of N transitions from R.
        self.minibatches()
        self.action_t_1_batch = self.actor_net.evaluate_target_actor(
            self.state_t_1_batch)
        # Q'(s_i+1, a_i+1)
        q_t_1 = self.critic_net.evaluate_target_critic(self.state_t_1_batch,
                                                       self.action_t_1_batch)

        self.y_i_batch = []
        for i in range(0, BATCH_SIZE):
            if self.done_batch[i]:
                self.y_i_batch.append(self.reward_batch[i])
            else:
                self.y_i_batch.append(self.reward_batch[i] +
                                      GAMMA * q_t_1[i][0])
        self.y_i_batch = np.array(self.y_i_batch)
        self.y_i_batch = np.reshape(self.y_i_batch, [len(self.y_i_batch), 1])

        # Update the critic by minimizing the loss.
        self.critic_net.train_critic(self.state_t_batch, self.action_batch,
                                     self.y_i_batch)

        # Update the actor proportional to the gradients:
        # Actions for computing delQ/dela; consider whether to use this
        # action or the action_t_batch itself.
        action_for_delQ = self.evaluate_actor(self.state_t_batch)
        self.del_Q_a = self.critic_net.compute_delQ_a(
            self.state_t_batch, action_for_delQ)  # /BATCH_SIZE
        self.del_Q_a = self.grad_inv.invert(self.del_Q_a, action_for_delQ)

        # Train the actor network proportional to delQ/dela and
        # del_Actor_model/del_actor_parameters:
        self.actor_net.train_actor(self.state_t_batch, self.del_Q_a)

        # Update the target critic and actor networks.
        self.critic_net.update_target_critic()
        self.actor_net.update_target_actor()
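# None of the snippets in this section define grad_inverter itself. The call
# pattern (construct with [upper_bounds, lower_bounds], then
# invert(gradients, actions)) matches the bounded-action gradient-inverting
# rule of Hausknecht & Stone, "Deep Reinforcement Learning in Parameterized
# Action Space". A minimal NumPy sketch of that rule, assuming the same
# [max, min] bounds layout used above; the source repositories implement this
# as a small TensorFlow graph, so treat this class as an illustration only:
import numpy as np


class GradInverterSketch:
    def __init__(self, action_bounds):
        # action_bounds = [per-dimension maxima, per-dimension minima]
        self.p_max = np.array(action_bounds[0], dtype=float)
        self.p_min = np.array(action_bounds[1], dtype=float)
        self.p_range = self.p_max - self.p_min

    def invert(self, grad, action):
        grad = np.asarray(grad, dtype=float)
        action = np.asarray(action, dtype=float)
        # If the gradient pushes an action dimension up, scale it by the
        # remaining headroom to the upper bound; otherwise scale it by the
        # distance to the lower bound. Actions outside the bounds make the
        # factor negative, which inverts the gradient and pushes the action
        # back inside the box.
        up = (self.p_max - action) / self.p_range
        down = (action - self.p_min) / self.p_range
        return np.where(grad > 0, grad * up, grad * down)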
class DDPG:
    """ Deep Deterministic Policy Gradient Algorithm"""

    def __init__(self, env, is_batch_norm):
        self.env = env
        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]

        if is_batch_norm:
            self.critic_net = CriticNet_bn(self.num_states, self.num_actions)
            self.actor_net = ActorNet_bn(self.num_states, self.num_actions)
        else:
            self.critic_net = CriticNet(self.num_states, self.num_actions)
            self.actor_net = ActorNet(self.num_states, self.num_actions)

        # Initialize the replay buffer:
        self.replay_memory = deque()

        # Initialize the time step:
        self.time_step = 0
        self.counter = 0

        action_max = np.array(env.action_space.high).tolist()
        action_min = np.array(env.action_space.low).tolist()
        action_bounds = [action_max, action_min]
        self.grad_inv = grad_inverter(action_bounds)

    def evaluate_actor(self, state_t):
        return self.actor_net.evaluate_actor(state_t)

    def add_experience(self, observation_1, observation_2, action, reward,
                       done):
        self.observation_1 = observation_1
        self.observation_2 = observation_2
        self.action = action
        self.reward = reward
        self.done = done
        self.replay_memory.append((self.observation_1, self.observation_2,
                                   self.action, self.reward, self.done))
        self.time_step = self.time_step + 1
        if len(self.replay_memory) > REPLAY_MEMORY_SIZE:
            self.replay_memory.popleft()

    def minibatches(self):
        batch = random.sample(self.replay_memory, BATCH_SIZE)
        # state t
        self.state_t_batch = [item[0] for item in batch]
        self.state_t_batch = np.array(self.state_t_batch)
        # state t+1
        self.state_t_1_batch = [item[1] for item in batch]
        self.state_t_1_batch = np.array(self.state_t_1_batch)
        self.action_batch = [item[2] for item in batch]
        self.action_batch = np.array(self.action_batch)
        self.action_batch = np.reshape(
            self.action_batch, [len(self.action_batch), self.num_actions])
        self.reward_batch = [item[3] for item in batch]
        self.reward_batch = np.array(self.reward_batch)
        self.done_batch = [item[4] for item in batch]
        self.done_batch = np.array(self.done_batch)

    def train(self):
        # Sample a random minibatch of N transitions from R.
        self.minibatches()
        self.action_t_1_batch = self.actor_net.evaluate_target_actor(
            self.state_t_1_batch)
        # Q'(s_i+1, a_i+1)
        q_t_1 = self.critic_net.evaluate_target_critic(self.state_t_1_batch,
                                                       self.action_t_1_batch)

        self.y_i_batch = []
        for i in range(0, BATCH_SIZE):
            if self.done_batch[i]:
                self.y_i_batch.append(self.reward_batch[i])
            else:
                self.y_i_batch.append(self.reward_batch[i] +
                                      GAMMA * q_t_1[i][0])
        self.y_i_batch = np.array(self.y_i_batch)
        self.y_i_batch = np.reshape(self.y_i_batch, [len(self.y_i_batch), 1])

        # Update the critic by minimizing the loss.
        self.critic_net.train_critic(self.state_t_batch, self.action_batch,
                                     self.y_i_batch)

        # Update the actor proportional to the gradients:
        action_for_delQ = self.evaluate_actor(self.state_t_batch)

        if is_grad_inverter:
            self.del_Q_a = self.critic_net.compute_delQ_a(
                self.state_t_batch, action_for_delQ)  # /BATCH_SIZE
            self.del_Q_a = self.grad_inv.invert(self.del_Q_a, action_for_delQ)
        else:
            self.del_Q_a = self.critic_net.compute_delQ_a(
                self.state_t_batch, action_for_delQ)[0]  # /BATCH_SIZE

        # Train the actor network proportional to delQ/dela and
        # del_Actor_model/del_actor_parameters:
        self.actor_net.train_actor(self.state_t_batch, self.del_Q_a)

        # Update the target critic and actor networks.
        self.critic_net.update_target_critic()
        self.actor_net.update_target_actor()
class DDPGAgent(Agent):
    ''' stevenpjg's implementation of DDPG algorithm '''

    REPLAY_MEMORY_SIZE = 10000
    BATCH_SIZE = 64
    GAMMA = 0.99

    def __init__(self, env, dir, is_batch_norm=False, is_grad_inverter=True,
                 training_flag=True):
        super().__init__(env, dir)
        assert isinstance(env.action_space, Box), \
            "action space must be continuous"

        if is_batch_norm:
            self.critic_net = CriticNet_bn(self.observation_space_size,
                                           self.action_space_size)
            self.actor_net = ActorNet_bn(self.observation_space_size,
                                         self.action_space_size)
        else:
            self.critic_net = CriticNet(self.observation_space_size,
                                        self.action_space_size)
            self.actor_net = ActorNet(self.observation_space_size,
                                      self.action_space_size)

        self.is_grad_inverter = is_grad_inverter
        self.training_flag = training_flag
        self.replay_memory = deque()
        self.time_step = 0

        action_max = np.array(self.high).tolist()
        action_min = np.array(self.low).tolist()
        action_bounds = [action_max, action_min]
        self.grad_inv = grad_inverter(action_bounds)

        self.data_fetch = None

    def add_data_fetch(self, df):
        self.data_fetch = df

    def get_short_name(self):
        return 'DDPG'

    def act(self, state):
        state = self._np_shaping(state, True)
        result = self.actor_net.evaluate_actor(state).astype(float)
        if self.data_fetch:
            self.data_fetch.set_actors_action(result[0].tolist())
        return result

    def observe(self, episode):
        episode['obs'] = self._np_shaping(episode['obs'], True)
        episode['action'] = self._np_shaping(episode['action'], False)
        episode['obs2'] = self._np_shaping(episode['obs2'], True)
        self.add_experience(episode)

    def add_experience(self, episode):
        self.replay_memory.append(episode)
        self.time_step += 1
        if len(self.replay_memory) > type(self).REPLAY_MEMORY_SIZE:
            self.replay_memory.popleft()

        if len(self.replay_memory) > type(self).BATCH_SIZE:
            res = self.train()
            return res
        else:
            return None

    def minibatches(self):
        batch = random.sample(self.replay_memory, type(self).BATCH_SIZE)
        # state t
        state = self._np_shaping(np.array([item['obs'] for item in batch]),
                                 True)
        # action
        action = self._np_shaping(np.array([item['action'] for item in batch]),
                                  False)
        # reward
        reward = np.array([item['reward'] for item in batch])
        # state t+1
        state_2 = self._np_shaping(np.array([item['obs2'] for item in batch]),
                                   True)
        # done
        done = np.array([item['done'] for item in batch])

        return state, action, reward, state_2, done

    def train(self):
        if not self.training_flag:
            return

        # Sample a random minibatch of N transitions from R.
        state, action, reward, state_2, done = self.minibatches()
        actual_batch_size = len(state)

        target_action = self.actor_net.evaluate_target_actor(state)

        # Q'(s_i+1, a_i+1)
        q_t = self.critic_net.evaluate_target_critic(state_2, target_action)

        y = []
        for i in range(0, actual_batch_size):
            if done[i]:
                y.append(reward[i])
            else:
                y.append(reward[i] + type(self).GAMMA * q_t[i][0])  # q_t+1 instead of q_t
        y = np.reshape(np.array(y), [len(y), 1])

        # Update the critic by minimizing the loss.
        self.critic_net.train_critic(state, action, y)

        # Update the actor proportional to the gradients:
        # action_for_delQ = self.act(state)  # was self.evaluate_actor instead of self.act
        action_for_delQ = self.actor_net.evaluate_actor(state)  # don't need the wolp action

        if self.is_grad_inverter:
            del_Q_a = self.critic_net.compute_delQ_a(
                state, action_for_delQ)  # /BATCH_SIZE
            del_Q_a = self.grad_inv.invert(del_Q_a, action_for_delQ)
        else:
            del_Q_a = self.critic_net.compute_delQ_a(
                state, action_for_delQ)[0]  # /BATCH_SIZE

        # Train the actor network proportional to delQ/dela and
        # del_Actor_model/del_actor_parameters:
        self.actor_net.train_actor(state, del_Q_a)

        # Update the target critic and actor networks.
        self.critic_net.update_target_critic()
        self.actor_net.update_target_actor()

    def save_agent(self, force=False, comment="default"):
        path = "{}/weights/{}".format(self.get_dir(), comment)
        if not os.path.exists(path):
            os.makedirs(path, exist_ok=True)
            print("Saving agent in", path)
            self.actor_net.save_model(path + '/actor.ckpt')
            self.critic_net.save_model(path + '/critic.ckpt')
        else:
            if force:
                print("Overwriting old agent in", path)
                self.actor_net.save_model(path + '/actor.ckpt')
                self.critic_net.save_model(path + '/critic.ckpt')
            else:
                print("Save aborted. An agent is already saved in", path)

    def load_agent(self, agent_name=None, comment="default"):
        if agent_name is None:
            path = "{}/weights/{}".format(self.get_dir(), comment)
        else:
            path = "{}/{}/{}/weights/{}".format(self.result_dir, agent_name,
                                                self.env.spec.id, comment)
        if os.path.exists(path):
            print("Loading agent saved in", path)
            self.actor_net.load_model(path + '/actor.ckpt')
            self.critic_net.load_model(path + '/critic.ckpt')
        else:
            print("Agent not found in", path)

    def close_session(self):
        self.actor_net.close()
        self.critic_net.close()
class DDPGAgent(Agent):
    ''' stevenpjg's implementation of DDPG algorithm '''

    REPLAY_MEMORY_SIZE = 10000
    BATCH_SIZE = 64
    GAMMA = 0.99

    def __init__(self, env, is_batch_norm=False, is_grad_inverter=True):
        super().__init__(env)

        if is_batch_norm:
            self.critic_net = CriticNet_bn(self.observation_space_size,
                                           self.action_space_size)
            self.actor_net = ActorNet_bn(self.observation_space_size,
                                         self.action_space_size)
        else:
            self.critic_net = CriticNet(self.observation_space_size,
                                        self.action_space_size)
            self.actor_net = ActorNet(self.observation_space_size,
                                      self.action_space_size)

        self.is_grad_inverter = is_grad_inverter
        self.replay_memory = deque()
        self.time_step = 0

        action_max = np.array(self.high).tolist()
        action_min = np.array(self.low).tolist()
        action_bounds = [action_max, action_min]
        self.grad_inv = grad_inverter(action_bounds)

    # EREZ ADDED
    def save(self, path):
        # TODO -- robust handling of where to put things
        # Everything from the super class can be pickled easily.
        attrs = Container()
        saved_critic = self.critic_net.save(path)
        saved_actor = self.actor_net.save(path)

        i_vars = vars(self)
        keys = i_vars.keys()
        for key in keys:
            tmp = getattr(self, key)
            if not isinstance(tmp, (CriticNet, ActorNet, grad_inverter)):
                setattr(attrs, key, tmp)

        file = os.path.join(path, "agent_data.pkl")  # TODO -- come up with a better name
        with open(file, "wb") as f:
            pickle.dump(attrs, f, pickle.HIGHEST_PROTOCOL)

    # EREZ ADDED
    # def restore(self, file): -- right now it is just in the subclass
    def restore(self, path):
        print("restoring the agent")
        file = os.path.join(path, "agent_data.pkl")  # TODO -- come up with a better name
        with open(file, "rb") as f:
            dump = pickle.load(f)

        i_vars = vars(dump)
        keys = i_vars.keys()
        for key in keys:
            tmp = getattr(dump, key)
            setattr(self, key, tmp)

        action_max = np.array(self.high).tolist()
        action_min = np.array(self.low).tolist()
        action_bounds = [action_max, action_min]
        self.grad_inv = grad_inverter(action_bounds)

        # Now replace the networks.
        # IGNORE THE "IS BATCH" CONDITION FOR NOW
        saved_critic_net = CriticNet(self.observation_space_size,
                                     self.action_space_size)
        saved_actor_net = ActorNet(self.observation_space_size,
                                   self.action_space_size)

        # Load in the saved graphs.
        critic_file = os.path.join(path, "critic_net.ckpt")
        saved_critic_net.restore(critic_file)
        actor_file = os.path.join(path, "actor_net.ckpt")
        saved_actor_net.restore(actor_file)

        self.critic_net = saved_critic_net
        self.actor_net = saved_actor_net

    def add_data_fetch(self, df):
        self.data_fetch = df

    def get_name(self):
        return 'DDPG' + super().get_name()

    def act(self, state):
        state = self._np_shaping(state, True)
        result = self.actor_net.evaluate_actor(state).astype(float)
        self.data_fetch.set_actors_action(result[0].tolist())
        return result

    def observe(self, episode):
        episode['obs'] = self._np_shaping(episode['obs'], True)
        episode['action'] = self._np_shaping(episode['action'], False)
        episode['obs2'] = self._np_shaping(episode['obs2'], True)
        self.add_experience(episode)

    def add_experience(self, episode):
        self.replay_memory.append(episode)
        self.time_step += 1
        if len(self.replay_memory) > type(self).REPLAY_MEMORY_SIZE:
            self.replay_memory.popleft()

        if len(self.replay_memory) > type(self).BATCH_SIZE:
            res = self.train()
            return res
        else:
            return None

    def minibatches(self):
        batch = random.sample(self.replay_memory, type(self).BATCH_SIZE)
        # state t
        state = self._np_shaping(np.array([item['obs'] for item in batch]),
                                 True)
        # action
        action = self._np_shaping(np.array([item['action'] for item in batch]),
                                  False)
        # reward
        reward = np.array([item['reward'] for item in batch])
        # state t+1
        state_2 = self._np_shaping(np.array([item['obs2'] for item in batch]),
                                   True)
        # done
        done = np.array([item['done'] for item in batch])

        return state, action, reward, state_2, done

    def train(self):
        # Sample a random minibatch of N transitions from R.
        state, action, reward, state_2, done = self.minibatches()
        actual_batch_size = len(state)

        target_action = self.actor_net.evaluate_target_actor(state)

        # Q'(s_i+1, a_i+1)
        q_t = self.critic_net.evaluate_target_critic(state_2, target_action)

        y = []
        for i in range(0, actual_batch_size):
            if done[i]:
                y.append(reward[i])
            else:
                y.append(reward[i] + type(self).GAMMA * q_t[i][0])  # q_t+1 instead of q_t
        y = np.reshape(np.array(y), [len(y), 1])

        # Update the critic by minimizing the loss.
        self.critic_net.train_critic(state, action, y)

        # Update the actor proportional to the gradients:
        # action_for_delQ = self.act(state)  # was self.evaluate_actor instead of self.act
        action_for_delQ = self.actor_net.evaluate_actor(state)  # don't need the wolp action

        if self.is_grad_inverter:
            del_Q_a = self.critic_net.compute_delQ_a(
                state, action_for_delQ)  # /BATCH_SIZE
            del_Q_a = self.grad_inv.invert(del_Q_a, action_for_delQ)
        else:
            del_Q_a = self.critic_net.compute_delQ_a(
                state, action_for_delQ)[0]  # /BATCH_SIZE

        # Train the actor network proportional to delQ/dela and
        # del_Actor_model/del_actor_parameters:
        self.actor_net.train_actor(state, del_Q_a)

        # Update the target critic and actor networks.
        self.critic_net.update_target_critic()
        self.actor_net.update_target_actor()
class DDPG:
    """ Deep Deterministic Policy Gradient Algorithm"""

    def __init__(self, env, is_batch_norm):
        self.env = env
        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]

        if is_batch_norm:
            self.critic_net = CriticNet_bn(self.num_states, self.num_actions)
            self.actor_net = ActorNet_bn(self.num_states, self.num_actions)
        else:
            self.critic_net = CriticNet(self.num_states, self.num_actions)
            self.actor_net = ActorNet(self.num_states, self.num_actions)

        # Initialize the replay buffer:
        self.replay_memory = deque()

        # Initialize the time step:
        self.time_step = 0
        self.counter = 0

        action_max = np.array(env.action_space.high).tolist()
        action_min = np.array(env.action_space.low).tolist()
        action_bounds = [action_max, action_min]
        self.grad_inv = grad_inverter(action_bounds)

    def evaluate_actor(self, state_t):
        return self.actor_net.evaluate_actor(state_t)

    def add_experience(self, observation_1, observation_2, action, reward,
                       done):
        self.observation_1 = observation_1
        self.observation_2 = observation_2
        self.action = action
        self.reward = reward
        self.done = done
        self.replay_memory.append((self.observation_1, self.observation_2,
                                   self.action, self.reward, self.done))
        self.time_step = self.time_step + 1
        if len(self.replay_memory) > REPLAY_MEMORY_SIZE:
            self.replay_memory.popleft()

    def minibatches(self):
        batch = random.sample(self.replay_memory, BATCH_SIZE)
        # state t
        self.state_t_batch = [item[0] for item in batch]
        self.state_t_batch = np.array(self.state_t_batch)
        # state t+1
        self.state_t_1_batch = [item[1] for item in batch]
        self.state_t_1_batch = np.array(self.state_t_1_batch)
        self.action_batch = [item[2] for item in batch]
        self.action_batch = np.array(self.action_batch)
        self.action_batch = np.reshape(self.action_batch,
                                       [len(self.action_batch),
                                        self.num_actions])
        self.reward_batch = [item[3] for item in batch]
        self.reward_batch = np.array(self.reward_batch)
        self.done_batch = [item[4] for item in batch]
        self.done_batch = np.array(self.done_batch)

    def train(self):
        # Sample a random minibatch of N transitions from R.
        self.minibatches()
        self.action_t_1_batch = self.actor_net.evaluate_target_actor(
            self.state_t_1_batch)
        # Q'(s_i+1, a_i+1)
        q_t_1 = self.critic_net.evaluate_target_critic(self.state_t_1_batch,
                                                       self.action_t_1_batch)

        self.y_i_batch = []
        for i in range(0, BATCH_SIZE):
            if self.done_batch[i]:
                self.y_i_batch.append(self.reward_batch[i])
            else:
                self.y_i_batch.append(self.reward_batch[i] +
                                      GAMMA * q_t_1[i][0])
        self.y_i_batch = np.array(self.y_i_batch)
        self.y_i_batch = np.reshape(self.y_i_batch, [len(self.y_i_batch), 1])

        # Update the critic by minimizing the loss.
        self.critic_net.train_critic(self.state_t_batch, self.action_batch,
                                     self.y_i_batch)

        # Update the actor proportional to the gradients:
        action_for_delQ = self.evaluate_actor(self.state_t_batch)

        if is_grad_inverter:
            self.del_Q_a = self.critic_net.compute_delQ_a(
                self.state_t_batch, action_for_delQ)  # /BATCH_SIZE
            self.del_Q_a = self.grad_inv.invert(self.del_Q_a, action_for_delQ)
        else:
            self.del_Q_a = self.critic_net.compute_delQ_a(
                self.state_t_batch, action_for_delQ)[0]  # /BATCH_SIZE

        # Train the actor network proportional to delQ/dela and
        # del_Actor_model/del_actor_parameters:
        self.actor_net.train_actor(self.state_t_batch, self.del_Q_a)

        # Update the target critic and actor networks.
        self.critic_net.update_target_critic()
        self.actor_net.update_target_actor()
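# A minimal sketch of the outer training loop that drives the DDPG classes
# above, assuming the classic (pre-0.26) gym API used by stevenpjg/ddpg-aigym
# and a continuous-action environment. The environment name, episode/step
# counts and the plain Gaussian exploration noise are illustrative assumptions
# only; the source repositories use their own runners and exploration schemes.
import gym
import numpy as np

env = gym.make("Pendulum-v0")            # assumed environment
agent = DDPG(env, is_batch_norm=False)   # the last class defined above

for episode in range(100):
    observation = env.reset()
    for step in range(200):
        state = np.reshape(observation, [1, env.observation_space.shape[0]])
        action = agent.evaluate_actor(state)[0]
        action = action + np.random.normal(0.0, 0.1, size=action.shape)  # exploration noise

        next_observation, reward, done, _ = env.step(action)
        agent.add_experience(observation, next_observation, action, reward, done)

        # Train once the replay memory holds at least one full minibatch
        # (BATCH_SIZE is the module-level constant sketched earlier).
        if len(agent.replay_memory) > BATCH_SIZE:
            agent.train()

        observation = next_observation
        if done:
            break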