class Brain:
    def __init__(self, num_actions, Double, Dueling, PER):
        self.num_actions = num_actions  # number of actions (2)
        self.Double = Double
        self.Dueling = Dueling
        self.PER = PER

        # Create the memory object that stores transitions
        self.memory = ReplayMemory(CAPACITY)

        # Build the networks
        n_out = num_actions
        self.main_q_network = Net_CNN(n_out, Dueling)    # use the Net class
        self.target_q_network = Net_CNN(n_out, Dueling)  # use the Net class
        print(self.main_q_network)  # print the network architecture

        # Choose the optimizer
        self.optimizer = optim.Adam(self.main_q_network.parameters(), lr=0.0001)

        # PER - create the memory object that stores TD errors
        if self.PER == True:
            self.td_error_memory = TDerrorMemory(CAPACITY)

    def replay(self, episode=0):
        '''Learn the network weights with Experience Replay'''
        # 1. Check the number of stored transitions
        if len(self.memory) < BATCH_SIZE:
            return

        # 2. Build a mini-batch
        if self.PER == True:
            self.batch, self.state_batch, self.action_batch, self.reward_batch, \
                self.non_final_next_states = self.make_minibatch(episode)
        else:
            self.batch, self.state_batch, self.action_batch, self.reward_batch, \
                self.non_final_next_states = self.make_minibatch()

        # 3. Compute Q(s_t, a_t) to use as the target signal
        self.expected_state_action_values = self.get_expected_state_action_values()

        # 4. Update the network weights
        self.update_main_q_network()

    def decide_action(self, state, episode):
        '''Decide an action from the current state'''
        # Epsilon-greedy: gradually increase the share of greedy actions
        epsilon = 0.5 * (1 / (episode + 1))

        if epsilon <= np.random.uniform(0, 1):
            self.main_q_network.eval()  # switch the network to inference mode
            with torch.no_grad():
                action = self.main_q_network(state).max(1)[1].view(1, 1)
            # max(1)[1] is the index of the largest network output
            # .view(1, 1) turns a [torch.LongTensor of size 1] into size 1*1
        else:
            # Return a random action (0 or 1)
            action = torch.LongTensor([[random.randrange(self.num_actions)]])
            # action has the shape [torch.LongTensor of size 1*1]

        return action

    def make_minibatch(self, episode=0):
        '''2. Build a mini-batch'''
        if self.PER == True:
            # 2.1 PER - sample the mini-batch from the memory object
            if episode < 30:
                transitions = self.memory.sample(BATCH_SIZE)
            else:
                # Sample the mini-batch using the TD errors instead
                indexes = self.td_error_memory.get_prioritized_indexes(BATCH_SIZE)
                transitions = [self.memory.memory[n] for n in indexes]
        else:
            # 2.1 Sample the mini-batch from the memory object
            transitions = self.memory.sample(BATCH_SIZE)

        # 2.2 Reshape each variable to fit the mini-batch.
        # transitions stores one (state, action, state_next, reward) tuple per step,
        # i.e. (state, action, state_next, reward) * BATCH_SIZE.
        # To build the mini-batch, convert this to
        # (state*BATCH_SIZE, action*BATCH_SIZE, state_next*BATCH_SIZE, reward*BATCH_SIZE).
        batch = Transition(*zip(*transitions))

        # 2.3 Reshape the elements of each variable so the network can consume them.
        # For state, BATCH_SIZE elements of [torch.FloatTensor of size 1*4] become
        # a single torch.FloatTensor of size BATCH_SIZE*4.
        # cat stands for concatenate.
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None])

        return batch, state_batch, action_batch, reward_batch, non_final_next_states

    def get_expected_state_action_values(self):
        '''Compute Q(s_t, a_t) to use as the target signal'''
        # 3.1 Switch the networks to inference mode
        self.main_q_network.eval()
        self.target_q_network.eval()

        # 3.2 Compute Q(s_t, a_t) with the network.
        # self.main_q_network(state_batch) outputs the Q values for both actions,
        # shaped [torch.FloatTensor of size BATCH_SIZE x 2].
        # We only need the Q value of the action a_t that was actually taken, so we
        # look up its index in action_batch and collect those Q values with gather.
        self.state_action_values = self.main_q_network(
            self.state_batch).gather(1, self.action_batch)

        # 3.3 Compute max_a Q(s_{t+1}, a), taking care about whether a next state exists.
        # Build an index mask of transitions that are not done and have a next_state.
        non_final_mask = torch.ByteTensor(
            tuple(map(lambda s: s is not None, self.batch.next_state)))
        # Initialize everything to zero first
        next_state_values = torch.zeros(BATCH_SIZE)

        # Double DQN
        if self.Double == True:
            a_m = torch.zeros(BATCH_SIZE).type(torch.LongTensor)

            # Use the main Q-network to find the action a_m that maximizes Q in the next state.
            # The trailing [1] picks the index of that action.
            a_m[non_final_mask] = self.main_q_network(
                self.non_final_next_states).detach().max(1)[1]

            # Keep only entries that have a next state and reshape size 32 to 32*1
            a_m_non_final_next_states = a_m[non_final_mask].view(-1, 1)

            # For indices that have a next state, evaluate action a_m with the target Q-network.
            # detach() pulls the value out of the graph.
            # squeeze() reshapes size [minibatch*1] to [minibatch].
            next_state_values[non_final_mask] = self.target_q_network(
                self.non_final_next_states).gather(
                    1, a_m_non_final_next_states).detach().squeeze()
        else:
            # Take the maximum Q value for indices that have a next state.
            # max(1) returns [values, indices] along the columns;
            # keep the values (index 0) and detach them from the graph.
            next_state_values[non_final_mask] = self.target_q_network(
                self.non_final_next_states).max(1)[0].detach()

        # 3.4 Compute the target Q(s_t, a_t) with the Q-learning update rule
        expected_state_action_values = self.reward_batch + GAMMA * next_state_values

        return expected_state_action_values

    def update_main_q_network(self):
        '''4. Update the network weights'''
        # 4.1 Switch the network to training mode
        self.main_q_network.train()

        # 4.2 Compute the loss (smooth_l1_loss is the Huber loss).
        # expected_state_action_values has size [minibatch], so unsqueeze it to [minibatch*1].
        loss = F.smooth_l1_loss(self.state_action_values,
                                self.expected_state_action_values.unsqueeze(1))

        # 4.3 Update the weights
        self.optimizer.zero_grad()  # reset the gradients
        loss.backward()             # backpropagate
        self.optimizer.step()       # update the weights

    def update_target_q_network(self):  # added for DDQN
        '''Sync the target Q-network with the main Q-network'''
        self.target_q_network.load_state_dict(self.main_q_network.state_dict())

    def update_td_error_memory(self):  # added for Prioritized Experience Replay
        '''Update the TD errors stored in the TD-error memory'''
        # Switch the networks to inference mode
        self.main_q_network.eval()
        self.target_q_network.eval()

        # Build a batch from all stored transitions
        transitions = self.memory.memory
        batch = Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None])

        # Compute the network output Q(s_t, a_t)
        state_action_values = self.main_q_network(state_batch).gather(
            1, action_batch)

        # Build an index mask of transitions that are not done and have a next_state
        non_final_mask = torch.ByteTensor(
            tuple(map(lambda s: s is not None, batch.next_state)))

        # Initialize everything to zero; the size equals the number of stored transitions
        next_state_values = torch.zeros(len(self.memory))
        a_m = torch.zeros(len(self.memory)).type(torch.LongTensor)

        # Use the main Q-network to find the action a_m that maximizes Q in the next state.
        # The trailing [1] picks the index of that action.
        a_m[non_final_mask] = self.main_q_network(
            non_final_next_states).detach().max(1)[1]

        # Keep only entries that have a next state and reshape to column form
        a_m_non_final_next_states = a_m[non_final_mask].view(-1, 1)

        # For indices that have a next state, evaluate action a_m with the target Q-network.
        # detach() pulls the value out of the graph.
        # squeeze() reshapes size [minibatch*1] to [minibatch].
        next_state_values[non_final_mask] = self.target_q_network(
            non_final_next_states).gather(
                1, a_m_non_final_next_states).detach().squeeze()

        # Compute the TD errors.
        # state_action_values has size [minibatch*1], so squeeze it to [minibatch].
        td_errors = (reward_batch + GAMMA * next_state_values) - \
            state_action_values.squeeze()

        # Update the TD-error memory: detach the tensor, convert it to NumPy,
        # then back to a Python list.
        self.td_error_memory.memory = td_errors.detach().numpy().tolist()
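# A minimal, hypothetical driver loop sketching how Brain is typically wired to an
# environment. Assumptions (not guaranteed by this code): a Gym-style `env`, a
# `preprocess()` helper that turns an observation into a FloatTensor suited to
# Net_CNN, a NUM_EPISODES constant, ReplayMemory.push(state, action, next_state,
# reward), and TDerrorMemory.push(td_error) as in common PER implementations.
brain = Brain(num_actions=2, Double=True, Dueling=True, PER=True)
for episode in range(NUM_EPISODES):
    state = preprocess(env.reset())
    done = False
    while not done:
        action = brain.decide_action(state, episode)
        obs, reward, done, _ = env.step(action.item())
        next_state = None if done else preprocess(obs)
        brain.memory.push(state, action, next_state, torch.FloatTensor([reward]))
        if brain.PER:
            brain.td_error_memory.push(0)  # placeholder TD error for the new transition
        brain.replay(episode)
        state = next_state
    if brain.PER:
        brain.update_td_error_memory()     # refresh priorities once per episode
    if episode % 2 == 0:
        brain.update_target_q_network()    # periodically sync the target network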
class Learner():
    def __init__(self, sess, s_size, a_size, scope, queues, trainer):
        self.queue = queues[0]
        self.param_queue = queues[1]
        self.replaymemory = ReplayMemory(100000)
        self.sess = sess
        self.learner_net = network(s_size, a_size, scope, 20)

        self.q = self.learner_net.q
        self.Q = self.learner_net.Q

        self.actions_q = tf.placeholder(shape=[None, a_size, N], dtype=tf.float32)
        self.q_target = tf.placeholder(shape=[None, N], dtype=tf.float32)
        self.ISWeights = tf.placeholder(shape=[None, N], dtype=tf.float32)

        self.q_actiona = tf.multiply(self.q, self.actions_q)
        self.q_action = tf.reduce_sum(self.q_actiona, axis=1)
        self.u = tf.abs(self.q_target - self.q_action)
        self.loss = tf.reduce_mean(
            tf.reduce_sum(tf.square(self.u) * self.ISWeights, axis=1))

        self.local_vars = self.learner_net.local_vars
        # tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
        self.gradients = tf.gradients(self.loss, self.local_vars)
        # grads, self.grad_norms = tf.clip_by_norm(self.gradients, 40.0)
        self.apply_grads = trainer.apply_gradients(
            zip(self.gradients, self.local_vars))
        self.sess.run(tf.global_variables_initializer())

    def run(self, gamma, s_size, a_size, batch_size, env):
        print('start learning')
        step, train1 = 0, False
        epi_q = []
        self.env = env
        while True:
            # Drain the queue of transitions sent by the actors
            if self.queue.empty():
                pass
            else:
                while not self.queue.empty():
                    t_error = self.queue.get()
                    step += 1
                    self.replaymemory.add(t_error)

            # Publish the current parameters for the actors to copy
            if self.param_queue.empty():
                params = self.sess.run(self.local_vars)
                self.param_queue.put(params)

            if step >= 10000:
                train1 = True
                step = 0

            if train1 == True:
                episode_buffer, tree_idx, ISWeights = self.replaymemory.sample(
                    batch_size)
                episode_buffer = np.array(episode_buffer)

                observations = episode_buffer[:, 0]
                actions = episode_buffer[:, 1]
                rewards = episode_buffer[:, 2]
                observations_next = episode_buffer[:, 3]
                dones = episode_buffer[:, 4]

                # Greedy next actions, chosen from the aggregated Q values
                Q_target = self.sess.run(
                    self.Q,
                    feed_dict={
                        self.learner_net.inputs: np.vstack(observations_next)
                    })
                actions_ = np.argmax(Q_target, axis=1)

                # One-hot masks for the taken actions and the greedy next actions,
                # broadcast over the N output heads
                action = np.zeros((batch_size, a_size))
                action_ = np.zeros((batch_size, a_size))
                for i in range(batch_size):
                    action[i][actions[i]] = 1
                    action_[i][actions_[i]] = 1
                action_now = np.zeros((batch_size, a_size, N))
                action_next = np.zeros((batch_size, a_size, N))
                for i in range(batch_size):
                    for j in range(a_size):
                        for k in range(N):
                            action_now[i][j][k] = action[i][j]
                            action_next[i][j][k] = action_[i][j]

                # Outputs of the greedy next action
                q_target = self.sess.run(
                    self.q_action,
                    feed_dict={
                        self.learner_net.inputs: np.vstack(observations_next),
                        self.actions_q: action_next
                    })

                # Bellman targets, zeroing the bootstrap term on terminal states
                q_target_batch = []
                for i in range(len(q_target)):
                    qi = q_target[i]
                    z_target_step = []
                    for j in range(len(qi)):
                        z_target_step.append(gamma * qi[j] * (1 - dones[i]) +
                                             rewards[i])
                    q_target_batch.append(z_target_step)
                q_target_batch = np.array(q_target_batch)

                # Broadcast the importance-sampling weights across the N outputs
                isweight = np.zeros((batch_size, N))
                for i in range(batch_size):
                    for j in range(N):
                        isweight[i, j] = ISWeights[i]

                feed_dict = {
                    self.q_target: q_target_batch,
                    self.learner_net.inputs: np.vstack(observations),
                    self.actions_q: action_now,
                    self.ISWeights: isweight
                }
                l, abs_errors, _ = self.sess.run(
                    [self.loss, self.u, self.apply_grads], feed_dict=feed_dict)
                abs_errors = np.mean(abs_errors, axis=1) + 1e-6
                self.replaymemory.update_priorities(tree_idx, abs_errors)
class Agent:
    def __init__(self, policy_net, target_net, durability, optimizer, name,
                 constants):
        """An agent class that takes action on the environment and optimizes
        the action based on the reward.

        Parameters
        ----------
        policy_net : DQN
            [description]
        target_net : DQN
            [description]
        durability : int
            [description]
        optimizer : [type]
            [description]
        name : str
            The name of agent
        constants : Constants
            The hyper-parameters from Constants class
        """
        self.CONSTANTS = constants
        self.policy_net = policy_net
        self.target_net = target_net
        self.target_net.load_state_dict(policy_net.state_dict())
        self.durability = durability
        self.optimizer = optimizer
        self.name = name
        self.memory = ReplayMemory(self.CONSTANTS.MEMORY_SIZE)
        self.steps_done = 0
        self.total_reward = 0.0
        self.reward = 0.0
        self.obtained_reward = 0.0
        self.n_best = 0
        self.policy_net_flag = False

    def select_action(self, state, is_first=False):
        sample = random.random()
        eps_threshold = self.CONSTANTS.EPS_END + \
            (self.CONSTANTS.EPS_START - self.CONSTANTS.EPS_END) * \
            math.exp(-1. * self.steps_done / self.CONSTANTS.EPS_DECAY)
        self.steps_done += 1
        if is_first:
            self.writer.add_graph(self.policy_net,
                                  input_to_model=state.to(self.CONSTANTS.DEVICE),
                                  profile_with_cuda=True)
        if sample > eps_threshold:
            with torch.no_grad():
                self.policy_net_flag = True
                return self.policy_net(state.to(
                    self.CONSTANTS.DEVICE)).max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(self.CONSTANTS.N_ACTIONS)]],
                                device=self.CONSTANTS.DEVICE,
                                dtype=torch.long)

    def select_core_action(self, best_agent_state, flag, best_agent_action):
        self.steps_done += 1
        if flag:
            with torch.no_grad():
                if best_agent_state is None:
                    return self.policy_net(self.state.to(
                        self.CONSTANTS.DEVICE)).max(1)[1].view(1, 1)
                else:
                    return self.policy_net(best_agent_state.to(
                        self.CONSTANTS.DEVICE)).max(1)[1].view(1, 1)
        else:
            return best_agent_action

    def optimize_model(self):
        if len(self.memory) < self.CONSTANTS.BATCH_SIZE:
            return
        transitions = self.memory.sample(self.CONSTANTS.BATCH_SIZE)
        # zip(*transitions) unzips the transitions into
        # Transition(*) creates new named tuple
        # batch.state - tuple of all the states (each state is a tensor)
        # batch.next_state - tuple of all the next states (each state is a tensor)
        # batch.reward - tuple of all the rewards (each reward is a float)
        # batch.action - tuple of all the actions (each action is an int)
        # Transition = ReplayMemory.get_transition()
        transition = self.CONSTANTS.TRANSITION
        batch = transition(*zip(*transitions))

        actions = tuple(
            map(lambda a: torch.tensor([[a]], device=self.CONSTANTS.DEVICE),
                batch.action))
        rewards = tuple(
            map(lambda r: torch.tensor([r], device=self.CONSTANTS.DEVICE),
                batch.reward))

        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, batch.next_state)),
                                      device=utils.get_device(),
                                      dtype=torch.bool)
        non_final_next_states = torch.cat([
            s for s in batch.next_state if s is not None
        ]).to(self.CONSTANTS.DEVICE)

        state_batch = torch.cat(batch.state).to(self.CONSTANTS.DEVICE)
        action_batch = torch.cat(actions)
        reward_batch = torch.cat(rewards)

        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        next_state_values = torch.zeros(self.CONSTANTS.BATCH_SIZE,
                                        device=self.CONSTANTS.DEVICE)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()

        expected_state_action_values = (next_state_values *
                                        self.CONSTANTS.GAMMA) + reward_batch

        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def set_tf_writer(self, path):
        self.writer = self._set_tf_writer(path)

    def _set_tf_writer(self, path):
        if self.name == "core":
            writer = SummaryWriter(log_dir="{}/tf-board/core/".format(path))
        else:
            writer = SummaryWriter(
                log_dir="{}/tf-board/{}".format(path, self.name))
        return writer

    def get_state(self):
        return self.state

    def get_next_state(self):
        return self.next_state

    def get_init_state(self):
        return self.init_state

    def get_name(self):
        return self.name

    def get_policy_net_flag(self):
        return self.policy_net_flag

    def set_init_state(self, state):
        self.init_state = state

    def set_state(self, state):
        self.state = state
        self.next_state = state

    def set_env(self, env):
        self.env = env

    def get_env(self):
        return self.env

    def set_action(self, action):
        self.action = action

    def get_action(self):
        return self.action

    def get_durability(self):
        return self.durability

    def get_policy_net(self):
        return self.policy_net

    def reduce_durability(self, value):
        self.durability = self.durability - value

    def heal_durability(self, value):
        self.durability = self.durability + value

    def set_done_state(self, done):
        self.done = done

    def set_total_reward(self, reward):
        self.reward = reward
        if reward > 0.0:
            self.obtained_reward += reward
        self.total_reward += reward

    def reset_total_reward(self):
        self.total_reward = 0.0
        self.obtained_reward = 0.0

    def get_reward(self):
        return self.reward

    def get_obtained_reward(self):
        return self.obtained_reward

    def best_counter(self):
        self.n_best += 1

    def get_n_best(self):
        return self.n_best

    def get_total_reward(self):
        return self.total_reward

    def set_step_retrun_value(self, obs, done, info):
        self.obs = obs
        self.done = done
        self.info = info

    def is_done(self):
        return self.done
class Worker():
    def __init__(self, env, name, s_size, a_size, trainer, model_path,
                 global_episodes):
        self.name = "worker_" + str(name)
        self.number = name
        self.model_path = model_path
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []

        # Create the local copy of the network and the tensorflow op to copy
        # global parameters to the local network
        self.local_Q = Q_Network(s_size, a_size, self.name, trainer)
        self.update_local_ops = update_target_graph('global', self.name)
        self.env = env
        self.replaymemory = ReplayMemory(max_memory)

    def train(self, rollout, sess, gamma, ISWeights):
        rollout = np.array(rollout)
        observations = rollout[:, 0]
        actions = rollout[:, 1]
        rewards = rollout[:, 2]
        next_observations = rollout[:, 3]
        dones = rollout[:, 4]

        Q_target = sess.run(
            self.local_Q.Q,
            feed_dict={self.local_Q.inputs: np.vstack(next_observations)})
        actions_ = np.argmax(Q_target, axis=1)

        action = np.zeros((batch_size, a_size))
        action_ = np.zeros((batch_size, a_size))
        for i in range(batch_size):
            action[i][actions[i]] = 1
            action_[i][actions_[i]] = 1
        action_now = np.zeros((batch_size, a_size, N))
        action_next = np.zeros((batch_size, a_size, N))
        for i in range(batch_size):
            for j in range(a_size):
                for k in range(N):
                    action_now[i][j][k] = action[i][j]
                    action_next[i][j][k] = action_[i][j]

        q_target = sess.run(self.local_Q.q_action,
                            feed_dict={
                                self.local_Q.inputs: np.vstack(next_observations),
                                self.local_Q.actions_q: action_next
                            })
        q_target_batch = []
        for i in range(len(q_target)):
            qi = q_target[i]  # * (1 - dones[i])
            z_target_step = []
            for j in range(len(qi)):
                z_target_step.append(gamma * qi[j] + rewards[i])
            q_target_batch.append(z_target_step)
        q_target_batch = np.array(q_target_batch)

        isweight = np.zeros((batch_size, N))
        for i in range(batch_size):
            for j in range(N):
                isweight[i, j] = ISWeights[i]
        feed_dict = {
            self.local_Q.inputs: np.vstack(observations),
            self.local_Q.actions_q: action_now,
            self.local_Q.q_target: q_target_batch,
            self.local_Q.ISWeights: isweight
        }
        u, l, g_n, v_n, _ = sess.run([
            self.local_Q.u, self.local_Q.loss, self.local_Q.grad_norms,
            self.local_Q.var_norms, self.local_Q.apply_grads
        ],
                                     feed_dict=feed_dict)
        return l / len(rollout), g_n, v_n, Q_target, u

    def work(self, gamma, sess, coord, saver):
        global GLOBAL_STEP
        episode_count = sess.run(self.global_episodes)
        total_steps = 0
        epsilon = 0.2
        print("Starting worker " + str(self.number))
        best_mean_episode_reward = -float('inf')
        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop():
                sess.run(self.update_local_ops)
                episode_reward = 0
                episode_step_count = 0
                d = False
                s = self.env.reset()
                s = process_frame(s)
                if epsilon > 0.01:
                    epsilon = epsilon * 0.997
                while not d:
                    # self.env.render()
                    GLOBAL_STEP += 1
                    # Take an action with an epsilon-greedy policy over the Q-network output
                    if random.random() > epsilon:
                        a_dist_list = sess.run(
                            self.local_Q.Q,
                            feed_dict={self.local_Q.inputs: [s]})
                        a_dist = a_dist_list[0]
                        a = np.argmax(a_dist)
                    else:
                        a = random.randint(0, 5)
                    s1, r, d, _ = self.env.step(a)
                    if d == False:
                        s1 = process_frame(s1)
                    else:
                        s1 = s
                    self.replaymemory.add([s, a, r, s1, d])
                    episode_reward += r
                    s = s1
                    total_steps += 1
                    episode_step_count += 1
                    if total_steps % 2 == 0 and d != True and total_steps > 50000:
                        episode_buffer, tree_idx, ISWeights = \
                            self.replaymemory.sample(batch_size)
                        l, g_n, v_n, Q_target, u = self.train(
                            episode_buffer, sess, gamma, ISWeights)
                        u = np.mean(u, axis=1) + 1e-6
                        self.replaymemory.update_priorities(tree_idx, u)
                    if d == True:
                        break
                sess.run(self.update_local_ops)
                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)
                # Periodically print statistics and save model parameters
                if episode_count % 5 == 0 and episode_count != 0 and total_steps > max_memory:
                    if self.name == 'worker_0' and episode_count % 5 == 0:
                        print('\n episode: ', episode_count, 'global_step:',
                              GLOBAL_STEP, 'mean episode reward: ',
                              np.mean(self.episode_rewards[-10:]),
                              'epsilon: ', epsilon)
                        print('loss', l, 'Qtargetmean', np.mean(Q_target))
                    if episode_count % 100 == 0 and self.name == 'worker_0' and total_steps > 10000:
                        saver.save(
                            sess, self.model_path + '/qr-dqn-' +
                            str(episode_count) + '.cptk')
                        print("Saved Model")
                    mean_reward = np.mean(self.episode_rewards[-100:])
                    if episode_count > 20 and best_mean_episode_reward < mean_reward:
                        best_mean_episode_reward = mean_reward
                if self.name == 'worker_0':
                    sess.run(self.increment)
                episode_count += 1
class DQNAgent(GymAgent):
    """An agent for running the DQN algorithm (Mnih et al. 2013)."""
    def __init__(self, env, mode, pre_trained_model, tensorboard_writer=None):
        super(DQNAgent, self).__init__(env, mode, tensorboard_writer)
        self.agent_name = 'DQN' + str(self.agent_no)
        self.memory = ReplayMemory()
        self.network = DeepQNetwork(self.obs_space[0], self.action_space)

        if self.mode == 'play':
            self.network.load_params(pre_trained_model)
            self.network.eval()
        elif self.mode == 'train':
            self.eval_network = DeepQNetwork(self.obs_space[0],
                                             self.action_space)
            self.eval_network.eval()
            if pre_trained_model:
                self.eval_network.load_params(pre_trained_model)
            self.optimizer = optim.RMSprop(self.network.parameters(), lr=LR)
            self.loss_func = SmoothL1Loss()
        else:
            raise ValueError(
                'Please set a valid mode for the agent (play or train)')

    def interact(self, state, action):
        """ returns: state, reward, done, info """
        return self.env.step(action, state)

    def select_action(self, state):
        if self.mode == 'play':
            return self.network(prep_exploitation(state)).max(1)[1].view(1, 1)

        # epsilon-greedy policy
        eps_threshold = EPS_START * EPS_DECAY**self.no_training_steps \
            if EPS_DECAY > EPS_END else EPS_END
        self.no_training_steps += 1

        if random.random() > eps_threshold:
            with torch.no_grad():
                return self.network(prep_exploitation(state)).max(1)[1].view(1, 1)
        else:
            return prep_exploration(self.action_space)

    def optimize(self):
        sum_loss = 0
        if len(self.memory) < BATCH_SIZE:
            batch_size = len(self.memory)
        else:
            batch_size = BATCH_SIZE

        s, a, _s, r = prep_mem_batch(self.memory.sample(batch_size))

        non_final_next = torch.cat([sa for sa in _s if sa is not None])
        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, _s)))

        state_action_values = self.network(s).gather(1, a.long().unsqueeze(1))

        next_state_values = torch.zeros(batch_size)
        next_state_values[non_final_mask] = self.eval_network(
            non_final_next).detach().max(1)[0]

        expected_q = prep_q(next_state_values, r)

        loss = self.loss_func(state_action_values, expected_q.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def train(self, num_episodes, render=False, lr_decay=False):
        end_state = np.zeros(self.obs_space)
        state = end_state
        for episode in range(1, num_episodes + 1):
            done = False
            timesteps = 0
            rewards = []
            sum_rewards = []
            loss = 0
            times_alive = []
            while not done:
                if state is end_state:
                    state = self.env.initialize()
                if render:
                    self.env.render()
                action = self.select_action(state)
                _state, reward, done, _ = self.interact(action.item(), state)
                rewards.append(reward)
                timesteps += 1
                if done:
                    _state = end_state
                    sum_reward = np.sum(rewards)
                    sum_rewards.append(sum_reward)
                    mean_loss = loss / timesteps
                    times_alive.append(timesteps)
                    if self.writer:
                        self.writer.add_scalar(
                            self.agent_name + 'duration of episode',
                            timesteps, episode)
                        self.writer.add_scalar(
                            self.agent_name + 'mean reward of episode',
                            sum_reward, episode)
                        self.writer.add_scalar(
                            self.agent_name + 'mean loss of episode',
                            mean_loss, episode)
                    timesteps = 0
                self.memory.push(state, action,
                                 _state if _state is not None else end_state,
                                 reward)
                state = _state
                episode_loss = self.optimize()
                loss += episode_loss

            if lr_decay:
                for g in self.optimizer.param_groups:
                    g['lr'] = g['lr'] / (1 + (episode / LR_DECAY))

            if episode % TARGET_UPDATE == 0:
                if self.env.goal(times_alive):
                    print('goal reached your computer is smart :)')
                    self.eval_network.save_params(self.agent_name,
                                                  self.env.env_name)
                    break
                else:
                    times_alive = []
                    self.eval_network.update_params(self.network)

            print('episode ', episode, 'loss ', mean_loss, 'reward ',
                  np.mean(sum_rewards))
            # add your custom goals

    def play(self, num_episodes):
        for episode in range(1, num_episodes + 1):
            done = False
            state = self.env.initialize()
            while not done:
                self.env.render()
                action = self.select_action(state)
                _state, reward, done, _ = self.interact(action.item(), state)
                if done:
                    state = self.env.initialize()
class Agent:
    '''Interact with and learn from the environment.'''
    def __init__(self, state_size, action_size, seed, is_double_q=False):
        '''Initialize an Agent.

        Params
        ======
            state_size (int): the dimension of the state
            action_size (int): the number of actions
            seed (int): random seed
        '''
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.t_step = 0  # time step counter (for LEARN_EVERY_STEP and UPDATE_EVERY_STEP)
        self.running_loss = 0
        self.training_cnt = 0
        self.is_double_q = is_double_q

        self.qnetwork_local = QNetwork(self.state_size, self.action_size,
                                       seed).to(device)
        self.qnetowrk_target = QNetwork(self.state_size, self.action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        self.replay_memory = ReplayMemory(BATCH_SIZE, BUFFER_SIZE, seed)

    def act(self, state, mode, epsilon=None):
        '''Returns actions for given state as per current policy.

        Params
        ======
            state (array-like): current state
            mode (string): train or test
            epsilon (float): for epsilon-greedy action selection
        '''
        state = torch.from_numpy(state).float().unsqueeze(0).to(
            device)  # shape of state: (1, state_size)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local.forward(state)
        self.qnetwork_local.train()

        if mode == 'test':
            # pull action values from gpu to local cpu
            action = np.argmax(action_values.cpu().data.numpy())
        elif mode == 'train':
            if random.random() <= epsilon:  # random action
                action = random.choice(np.arange(self.action_size))
            else:  # greedy action
                # pull action values from gpu to local cpu
                action = np.argmax(action_values.cpu().data.numpy())

        return action

    def step(self, state, action, reward, next_state, done):
        # add new experience in memory
        self.replay_memory.add(state, action, reward, next_state, done)

        # activate learning every few steps
        self.t_step = self.t_step + 1
        if self.t_step % LEARN_EVERY_STEP == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.replay_memory) >= BUFFER_SIZE:
                experiences = self.replay_memory.sample(device)
                self.learn(experiences, GAMMA)

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        # compute and minimize the loss
        states, actions, rewards, next_states, dones = experiences

        q_local_chosen_action_values = self.qnetwork_local.forward(
            states).gather(1, actions)
        q_target_action_values = self.qnetowrk_target.forward(
            next_states).detach()  # detach from graph, don't backpropagate

        if self.is_double_q == True:
            # Double DQN: select the next action with the local network,
            # evaluate it with the target network
            q_local_next_actions = self.qnetwork_local.forward(
                next_states).detach().max(1)[1].unsqueeze(1)  # shape (batch_size, 1)
            q_target_best_action_values = q_target_action_values.gather(
                1, q_local_next_actions)
        elif self.is_double_q == False:
            q_target_best_action_values = q_target_action_values.max(
                1)[0].unsqueeze(1)  # shape (batch_size, 1)

        q_target_values = rewards + gamma * q_target_best_action_values * (
            1 - dones)  # zero value for terminal state

        td_errors = q_target_values - q_local_chosen_action_values
        loss = (td_errors**2).mean()

        self.running_loss += float(loss.cpu().data.numpy())
        self.training_cnt += 1

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        if self.t_step % UPDATE_EVERY_STEP == 0:
            self.update(self.qnetwork_local, self.qnetowrk_target)

    def update(self, local_netowrk, target_network):
        """Hard update model parameters, as indicated in the original paper.

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
        """
        for local_param, target_param in zip(local_netowrk.parameters(),
                                             target_network.parameters()):
            target_param.data.copy_(local_param.data)
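# A minimal, hypothetical interaction loop for the Agent class above. Assumptions
# (not taken from this code): a Gym-style environment with a flat observation
# vector; `gym`, the environment name, the episode count, and the epsilon schedule
# are illustrative only, and learning only starts once the replay memory holds
# BUFFER_SIZE transitions, as in Agent.step().
import gym

env = gym.make('LunarLander-v2')  # assumed environment: 8-dim state, 4 actions
agent = Agent(state_size=8, action_size=4, seed=0, is_double_q=True)
eps = 1.0
for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, mode='train', epsilon=eps)
        next_state, reward, done, _ = env.step(action)
        # store the transition; learning follows the LEARN_EVERY_STEP schedule above
        agent.step(state, action, reward, next_state, done)
        state = next_state
    eps = max(0.01, eps * 0.995)  # simple exponential epsilon decay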
                writer.add_scalar('episode_return/episode', int(ep_return),
                                  int(ep))
                break
            global_t += 1
            ep_t += 1

            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            if global_t % N_TIMESTEPS_PER_UPDATE == 0 and len(
                    replay_memory) > N_SAMPLES:
                # training loop
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                for _ in range(N_EPOCHS):
                    sample = replay_memory.sample(n_samples=N_SAMPLES,
                                                  sample_length=SAMPLE_LENGTH)
                    batch = episodes_to_batch(sample)

                    # Value estimates for the sampled states from the online and target critics
                    values = value_net(batch.states)
                    target_values = target_value_net(batch.states)

                    # Shift the target values one step to the left and pad with zeros,
                    # so index t holds V(s_{t+1}); the final step bootstraps from 0
                    shifted_values = torch.cat(
                        (target_values[:, 1:],
                         tensor(torch.zeros(target_values.shape[0], 1))),
                        dim=-1)

                    # One-step TD errors and their GAE-style discounted forward sums
                    deltas = (-values + batch.rewards +
                              GAMMA * shifted_values.detach())
                    advantages = tensor_forward_sum(deltas, GAMMA * LAMBDA)
                    value_net_loss = (advantages**2).mean()