class DQN(object):
    def __init__(self):
        self.pred_net, self.target_net = ConvNet(), ConvNet()
        # sync eval and target networks
        self.update_target(self.target_net, self.pred_net, 1.0)
        # use gpu
        if USE_GPU:
            self.pred_net.cuda()
            self.target_net.cuda()
        # simulator step counter
        self.memory_counter = 0
        # target network step counter
        self.learn_step_counter = 0
        # loss function
        self.loss_function = nn.MSELoss()
        # create the replay buffer
        self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)
        # define optimizer
        self.optimizer = torch.optim.Adam(self.pred_net.parameters(), lr=LR)

    def update_target(self, target, pred, update_rate):
        # update target network parameters using the prediction network
        for target_param, pred_param in zip(target.parameters(), pred.parameters()):
            target_param.data.copy_((1.0 - update_rate) * target_param.data
                                    + update_rate * pred_param.data)

    def save_model(self):
        # save prediction network and target network
        self.pred_net.save(PRED_PATH)
        self.target_net.save(TARGET_PATH)

    def load_model(self):
        # load prediction network and target network
        self.pred_net.load(PRED_PATH)
        self.target_net.load(TARGET_PATH)

    def choose_action(self, x, EPSILON):
        # x: state
        x = torch.FloatTensor(x)
        if USE_GPU:
            x = x.cuda()
        # epsilon-greedy policy
        if np.random.uniform() >= EPSILON:
            # greedy case
            action_value = self.pred_net(x)  # (N_ENVS, N_ACTIONS)
            action = torch.argmax(action_value, dim=1).data.cpu().numpy()
        else:
            # random exploration case
            action = np.random.randint(0, N_ACTIONS, (x.size(0)))
        return action

    def store_transition(self, s, a, r, s_, done):
        self.memory_counter += 1
        self.replay_buffer.add(s, a, r, s_, float(done))

    def learn(self):
        self.learn_step_counter += 1
        # target parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.update_target(self.target_net, self.pred_net, 1e-2)

        b_s, b_a, b_r, b_s_, b_d = self.replay_buffer.sample(BATCH_SIZE)
        # b_w, b_idxes = np.ones_like(b_r), None
        b_s = torch.FloatTensor(b_s)
        b_a = torch.LongTensor(b_a)
        b_r = torch.FloatTensor(b_r)
        b_s_ = torch.FloatTensor(b_s_)
        b_d = torch.FloatTensor(b_d)
        if USE_GPU:
            b_s, b_a, b_r, b_s_, b_d = b_s.cuda(), b_a.cuda(), b_r.cuda(), b_s_.cuda(), b_d.cuda()

        # action value for the current state
        q_eval = self.pred_net(b_s)
        mb_size = q_eval.size(0)
        q_eval = torch.stack([q_eval[i][b_a[i]] for i in range(mb_size)])

        # optimal action value for the next state
        q_next = self.target_net(b_s_)
        # best_actions = q_next.argmax(dim=1)
        # q_next = torch.stack([q_next[i][best_actions[i]] for i in range(mb_size)])
        q_next = torch.max(q_next, -1)[0]
        q_target = b_r + GAMMA * (1. - b_d) * q_next
        q_target = q_target.detach()

        # loss
        loss = self.loss_function(q_eval, q_target)
        logger.store(loss=loss)

        # backprop loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss
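# The learn() step above regresses Q(s, a) toward the standard one-step TD target.
# Below is a minimal, self-contained sketch of that target computation as a standalone
# function; the tensor names and the default gamma are illustrative assumptions, not
# values taken from this code base.
import torch

def one_step_td_target(b_r, b_d, q_next_all, gamma=0.99):
    # b_r: (m,) rewards, b_d: (m,) done flags in {0, 1}, q_next_all: (m, n_actions) target-net Q-values
    q_next = q_next_all.max(dim=-1)[0]                     # max_a' Q_target(s', a')
    return (b_r + gamma * (1.0 - b_d) * q_next).detach()   # no gradient flows through the target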
class Agent(): def __init__(self, state_size, action_size, behavior_name, index_player, replay_memory_size=1e4, batch_size=512, gamma=0.99, learning_rate=1e4, target_tau=1e3, update_rate=100, seed=0): #affect your agent vs other agents self.state_size = state_size self.current_state = [] self.action_size = action_size self.buffer_size = int(replay_memory_size) self.batch_size = batch_size self.gamma = gamma self.learn_rate = learning_rate self.tau = target_tau self.update_rate = update_rate self.seed = random.seed(seed) self.behavior_name = behavior_name self.index_player = index_player self.close_ball_reward = 0 self.touch_ball_reward = 0 """ Now we define two models: (a) one netwoek will be updated every (step % update_rate == 0), (b) A target network, with weights updated to equal to equal to the network (a) at a slower (target_tau) rate. """ self.network = QNetwork(state_size, action_size, seed).to(device) self.target_network = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.network.parameters(), lr=self.learn_rate) # Replay memory self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def load_model(self, path_model, path_target=None): params = torch.load(path_model) self.network.set_params(params) self.network.load_state_dict(torch.load(path_model)) if path_target != None: self.target_network.load_state_dict(torch.load(path_target)) def model_step(self, state, action, reward, next_state): # save experience in replay memory self.memory.add(state, action, reward, next_state) # learn every UPDATE_EVERY time steps self.t_step = self.t_step + 1 if self.t_step % self.update_rate == 0: # if enough samples are available in memory, get random subset and learn if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences, self.gamma, self.t_step) def choose_action(self, state, eps=0.0): state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.network.eval() with torch.no_grad(): action_values = self.network(state) self.network.train() # epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy() ) # return a number from 0 to action_size else: return random.choice(np.arange( self.action_size)) # return a number from 0 to action_size def learn(self, experiences, gamma, stp): states, actions, rewards, next_states = experiences # Get Q values from current observations (s,a) using model network # get max Q values for (s', a') from target model self.network.train() Q_sa = self.network(states).gather(1, actions) #print(Q_sa) Q_sa_prime_target_values = self.target_network(next_states).max( 1)[0].to(device).float().detach() #Q_sa_prime_targets = Q_sa_prime_target_values.max(1)[0].unsqueeze(1) #print(Q_sa_prime_target_values) # compute Q targets for current states #print(rewards) Q_sa_targets = rewards + gamma * Q_sa_prime_target_values.unsqueeze(1) #print(Q_sa_targets) #input('train') #Q_sa_targets = Q_sa_targets.unsqueeze(1) # Compute loss (error) criterion = torch.nn.MSELoss(reduction='sum') loss = criterion( Q_sa.to(device), Q_sa_targets.to(device)) #F.mse_loss(Q_sa, Q_sa_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # update target network if stp % 100 == 0: print('Updating Model') self.soft_update(self.network, self.target_network, self.tau) def soft_update(self, local_model, target_model, tau): for target_param, 
local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def Read(self): decision_steps, terminal_steps = env.get_steps(self.behavior_name) try: signal_front = np.array( sensor_front_sig( decision_steps.obs[0][self.index_player, :])) # 3 x 11 x 8 signal_back = np.array( sensor_back_sig( decision_steps.obs[1][self.index_player, :])) # 3 x 3 x 8 #pre_state = [] signal_front = np.array(signal_front) #print(signal_front.shape) #print(signal_back.shape) r = np.concatenate((signal_front, signal_back), axis=1) #print(r.shape) #input('ff') #pre_state.extend(list(np.array(signal_front).flatten())) #pre_state.extend(list(np.array(signal_back).flatten())) #state = np.array(pre_state) self.current_state = r count_close_to_ball = 0 count_touch_ball = 0 count_back_touch = 0 count_back_close = 0 self.rew_d_to_our_post = 0 self.rew_for_ball_dist = -0.1 # Front Observation for i in range(len(signal_front[0])): if signal_front[0][i][0] == 1.0: count_close_to_ball += 1 self.rew_for_ball_dist = max( 0.3 * (1 - signal_front[0][i][7]), self.rew_for_ball_dist) # Kicked the ball at the front if signal_front[0][i][7] <= 0.03: count_touch_ball += 1 if signal_front[0][i][1] == 1.0: self.rew_d_to_our_post = -0.1 if signal_front[0][i][2] == 1.0: self.rew_d_to_our_post = 0.1 # Back observation for i in range(len(signal_back[0])): if signal_back[0][i][0] == 1.0: count_back_close += 0.2 # Touches the ball at the back if signal_back[0][i][7] <= 0.03: count_back_touch += 0.3 self.back_touch = 1 if count_back_touch > 0 else 0.2 self.back_close = 1 if count_back_close > 0 else 0.1 # add reward if kick the ball self.touch_ball_reward = 1 if count_touch_ball > 0 else -0.15 # Penalize for back touching the ball if count_back_touch > 0: self.touch_ball_reward = -0.25 # Penalize if the ball is not in view self.close_ball_reward = 0.25 if count_close_to_ball > 0 else -0.05 # Penalize if the ball is behind the agent if count_back_close > 0: self.close_ball_reward = -0.1 return self.current_state except: self.touch_ball_reward = 0 self.close_ball_reward = 0 return self.current_state def upd_after_goal(self, n_upds): self.memory.upd_goal(n_upds) if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences, self.gamma, self.t_step) def we_goll(self): self.memory.we_goll() if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences, self.gamma, self.t_step) experiences = self.memory.sample() self.learn(experiences, self.gamma, self.t_step) def us_goll(self): self.memory.us_goll() if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences, self.gamma, self.t_step) experiences = self.memory.sample() self.learn(experiences, self.gamma, self.t_step)
def train(sess, env, actor, critic, noise, reward, discrete, saver, checkpoint_path):
    # Set up summary writer
    summary_writer = tf.summary.FileWriter("ddpg_summary")

    actor.update()
    critic.update()

    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    # Initialize noise
    ou_level = 0.

    for i in range(MAX_EPISODES):
        if i % 100 == 0:
            saver.save(sess, checkpoint_path)

        # s is the initial state returned by the environment
        s = env.reset()
        ep_reward = 0
        ep_ave_max_q = 0

        # initialize the episode buffer
        episode_buffer = np.empty((0, 5), float)

        for j in range(MAX_EP_STEPS):
            # print(critic.w1.eval()[0,0])
            env.render()

            # a is the action tensor [None x action_dim] predicted from the actor's current policy
            a = actor.predict(np.reshape(s, (1, actor.state_dim)))

            # Add noise for e-greedy-style exploration in a stochastic environment.
            # See ornstein_uhlenbeck_level below; this is a common technique for
            # exploration in continuous action spaces.
            if i < NOISE_MAX_EP:
                ou_level = noise.ornstein_uhlenbeck_level(ou_level)
                a = a + ou_level

            # Set action for discrete and continuous action spaces
            if discrete:
                action = np.argmax(a)
            else:
                action = a[0]

            # Step the environment with the chosen action and receive the result
            s2, r, terminal, info = env.step(action)

            # accumulate the episode's total reward
            ep_reward += r

            # ==========================================[important part]========================================
            # Add the transition to the episode buffer:
            # append [s, a, r, terminal, s2] to the numpy array episode_buffer.
            episode_buffer = np.append(episode_buffer, [[s, a, r, terminal, s2]], axis=0)
            # ===================================================================================================

            # If the replay buffer holds more than a minibatch of data, sample it and train.
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Query the critic for the action-value function Q(s, a).
                # The critic evaluates the current policy while learning the action-value
                # function, so it is fed the target actor's prediction for the next state.
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Train the critic's action-value function.
                # It needs the state batch, the action batch, and the target (y) batch.
                # As in the DQN paper (DeepMind et al.), the target is the final reward itself
                # if the transition is terminal; otherwise it is the reward at s plus the
                # discounted Q-value at s2.
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                # Track the average Q value to gauge model performance.
                # The DQN paper mentions it as a useful indicator of training progress.
                ep_ave_max_q += np.amax(predicted_q_value)

                # Use the sampled state batch to get the actions of the actor's current policy,
                # then feed them with the states into the critic to obtain the action gradients
                # of Q(s, a).
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)

                # grads has shape (1, BATCH_SIZE, ACTION_DIM), so take grads[0] for actor.train,
                # which expects (BATCH_SIZE, ACTION_DIM).
                # The actor's policy is updated using the critic's action gradients.
                actor.train(s_batch, grads[0])

                # Update both the actor and critic target networks.
                actor.update()
                critic.update()

            # continue with s2 as the new s
            s = s2

            if terminal:
                episode_buffer = reward.discount(episode_buffer)

                # Add episode to replay buffer
                for step in episode_buffer:
                    replay_buffer.add(
                        np.reshape(step[0], (actor.state_dim, )),
                        np.reshape(step[1], (actor.action_dim, )),
                        step[2], step[3],
                        np.reshape(step[4], (actor.state_dim, )))

                summary = tf.Summary()
                summary.value.add(tag='Perf/Reward', simple_value=float(ep_reward))
                summary.value.add(tag='Perf/Qmax',
                                  simple_value=float(ep_ave_max_q / float(j)))
                summary_writer.add_summary(summary, i)
                summary_writer.flush()

                print('| Reward: %.2i' % int(ep_reward), " | Episode", i,
                      '| Qmax: %.4f' % (ep_ave_max_q / float(j)))
                break
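# The training loop above adds exploration noise via noise.ornstein_uhlenbeck_level(ou_level).
# A hedged sketch of such an update is shown below; the parameter names and default values
# (theta, mu, sigma, dt) are assumptions for illustration, not this repository's settings.
import numpy as np

def ornstein_uhlenbeck_level(level, theta=0.15, mu=0.0, sigma=0.2, dt=1.0):
    # Mean-reverting random walk: x_{t+1} = x_t + theta*(mu - x_t)*dt + sigma*sqrt(dt)*N(0, 1)
    return level + theta * (mu - level) * dt + sigma * np.sqrt(dt) * np.random.normal()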
# Number of updates per step in environment for i in range(config['updates_per_step']): # Update parameters of all the networks agent.train(memory, config['batch_size']) updates += 1 next_state, reward, done, _ = env.step(action) # Step episode_steps += 1 total_numsteps += 1 episode_reward += reward # Ignore the "done" signal if it comes from hitting the time horizon. # (https://github.com/openai/spinningup/blob/master/spinup/algos/sac/sac.py) done_bool = float( done) if episode_steps < env._max_episode_steps else 0 memory.add(state, action, next_state, reward, done_bool) # Append transition to memory state = next_state if total_numsteps > config['num_steps']: break # writer.add_scalar('reward/train', episode_reward, i_episode) print("Episode: {}, total numsteps: {}, episode steps: {}, reward: {}". format(i_episode, total_numsteps, episode_steps, round(episode_reward, 2))) if total_numsteps > test_step and config['eval'] == True: test(env) test_step += 10000
class DQN(object):
    def __init__(self):
        self.pred_net, self.target_net = ConvNet(), ConvNet()
        # sync eval and target networks
        self.update_target(self.target_net, self.pred_net, 1.0)
        # use gpu
        if USE_GPU:
            self.pred_net.cuda()
            self.target_net.cuda()
        # simulator step counter
        self.memory_counter = 0
        # target network step counter
        self.learn_step_counter = 0
        # create the replay buffer
        self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)
        # define optimizer
        self.optimizer = torch.optim.Adam(self.pred_net.parameters(), lr=LR)

    # Update target network
    def update_target(self, target, pred, update_rate):
        # update target network parameters using the prediction network
        for target_param, pred_param in zip(target.parameters(), pred.parameters()):
            target_param.data.copy_((1.0 - update_rate) * target_param.data
                                    + update_rate * pred_param.data)

    def save_model(self):
        # save prediction network and target network
        self.pred_net.save(PRED_PATH)
        self.target_net.save(TARGET_PATH)

    def load_model(self):
        # load prediction network and target network
        self.pred_net.load(PRED_PATH)
        self.target_net.load(TARGET_PATH)

    def choose_action(self, x, EPSILON):
        # x: state
        x = torch.FloatTensor(x)
        if USE_GPU:
            x = x.cuda()
        # epsilon-greedy policy
        if np.random.uniform() >= EPSILON:
            # greedy case
            action_value, tau = self.pred_net(x)  # (N_ENVS, N_ACTIONS, N_QUANT)
            action_value = action_value.mean(dim=2)
            action = torch.argmax(action_value, dim=1).data.cpu().numpy()
        else:
            # random exploration case
            action = np.random.randint(0, N_ACTIONS, (x.size(0)))
        return action

    def store_transition(self, s, a, r, s_, done):
        self.memory_counter += 1
        self.replay_buffer.add(s, a, r, s_, float(done))

    def learn(self):
        self.learn_step_counter += 1
        # target parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.update_target(self.target_net, self.pred_net, 1e-2)

        b_s, b_a, b_r, b_s_, b_d = self.replay_buffer.sample(BATCH_SIZE)
        b_w, b_idxes = np.ones_like(b_r), None

        b_s = torch.FloatTensor(b_s)
        b_a = torch.LongTensor(b_a)
        b_r = torch.FloatTensor(b_r)
        b_s_ = torch.FloatTensor(b_s_)
        b_d = torch.FloatTensor(b_d)
        if USE_GPU:
            b_s, b_a, b_r, b_s_, b_d = b_s.cuda(), b_a.cuda(), b_r.cuda(), b_s_.cuda(), b_d.cuda()

        # action value distribution prediction
        q_eval, q_eval_tau = self.pred_net(b_s)  # (m, N_ACTIONS, N_QUANT), (N_QUANT, 1)
        mb_size = q_eval.size(0)
        # torch.stack stacks the per-sample rows along a new batch dimension (dim=0 by default);
        # index_select picks the row of the taken action; squeeze removes the singleton dimension.
        q_eval = torch.stack([
            q_eval[i].index_select(0, b_a[i]) for i in range(mb_size)
        ]).squeeze(1)  # (m, N_QUANT)
        # add a trailing dimension after the quantile dimension of q_eval
        q_eval = q_eval.unsqueeze(2)  # (m, N_QUANT, 1)
        # note that dim 1 is for the present quantile, dim 2 is for the next quantile

        # get next state value
        q_next, q_next_tau = self.target_net(b_s_)  # (m, N_ACTIONS, N_QUANT), (N_QUANT, 1)
        best_actions = q_next.mean(dim=2).argmax(dim=1)  # (m)
        q_next = torch.stack([
            q_next[i].index_select(0, best_actions[i]) for i in range(mb_size)
        ]).squeeze(1)  # q_next: (m, N_QUANT)

        # q_target = R + gamma * (1 - terminate) * q_next
        q_target = b_r.unsqueeze(1) + GAMMA * (1. - b_d.unsqueeze(1)) * q_next  # (m, N_QUANT)
        # detach so that no gradients are propagated into the target
        q_target = q_target.unsqueeze(1).detach()  # (m, 1, N_QUANT)

        # quantile Huber loss
        u = q_target.detach() - q_eval  # (m, N_QUANT, N_QUANT)
        tau = q_eval_tau.unsqueeze(0)  # (1, N_QUANT, 1)
        # note that tau is for the present quantile
        # w = |tau - delta(u < 0)|
        weight = torch.abs(tau - u.le(0.).float())  # (m, N_QUANT, N_QUANT)
        loss = F.smooth_l1_loss(q_eval, q_target.detach(), reduction='none')  # (m, N_QUANT, N_QUANT)
        loss = torch.mean(weight * loss, dim=1).mean(dim=1)

        # calculate importance weighted loss
        b_w = torch.Tensor(b_w)
        if USE_GPU:
            b_w = b_w.cuda()
        loss = torch.mean(b_w * loss)

        # backprop loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss
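# The block above assembles the quantile Huber loss used by IQN/QR-DQN style agents:
# pairwise TD errors u between target and predicted quantiles are weighted by
# |tau - 1{u < 0}| and passed through a Huber (smooth L1) penalty. A minimal standalone
# sketch follows; the shapes and the reduction mirror the code above, but the function
# itself is illustrative rather than this code base's implementation.
import torch
import torch.nn.functional as F

def quantile_huber_loss(theta, theta_target, tau):
    # theta: (m, N, 1) predicted quantiles, theta_target: (m, 1, N) target quantiles, tau: (1, N, 1)
    u = theta_target.detach() - theta                           # (m, N, N) pairwise TD errors
    weight = torch.abs(tau - (u < 0.0).float())                 # |tau - delta(u < 0)|
    huber = F.smooth_l1_loss(theta.expand_as(u),
                             theta_target.expand_as(u).detach(),
                             reduction='none')                  # (m, N, N)
    return (weight * huber).mean(dim=1).mean(dim=1)             # per-sample loss, shape (m,)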
class DQN(object): def __init__(self): self.pred_net, self.target_net = ConvNet(), ConvNet() # sync eval target self.update_target(self.target_net, self.pred_net, 1.0) # use gpu if USE_GPU: self.pred_net.cuda() self.target_net.cuda() # simulator step conter self.memory_counter = 0 # target network step counter self.learn_step_counter = 0 # ceate the replay buffer self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY) # define optimizer self.optimizer = torch.optim.Adam(self.pred_net.parameters(), lr=LR) # discrete values self.value_range = torch.FloatTensor(V_RANGE) # (N_ATOM) if USE_GPU: self.value_range = self.value_range.cuda() def update_target(self, target, pred, update_rate): # update target network parameters using predcition network for target_param, pred_param in zip(target.parameters(), pred.parameters()): target_param.data.copy_((1.0 - update_rate) \ * target_param.data + update_rate*pred_param.data) def save_model(self): # save prediction network and target network self.pred_net.save(PRED_PATH) self.target_net.save(TARGET_PATH) def load_model(self): # load prediction network and target network self.pred_net.load(PRED_PATH) self.target_net.load(TARGET_PATH) def choose_action(self, x, EPSILON): x = torch.FloatTensor(x) if USE_GPU: x = x.cuda() if np.random.uniform() >= EPSILON: # greedy case action_value_dist = self.pred_net(x) # (N_ENVS, N_ACTIONS, N_ATOM) action_value = torch.sum(action_value_dist * self.value_range.view(1, 1, -1), dim=2) # (N_ENVS, N_ACTIONS) action = torch.argmax(action_value, dim=1).data.cpu().numpy() else: # random exploration case action = np.random.randint(0, N_ACTIONS, (x.size(0))) return action def store_transition(self, s, a, r, s_, done): self.memory_counter += 1 self.replay_buffer.add(s, a, r, s_, float(done)) def learn(self): self.learn_step_counter += 1 # target parameter update if self.learn_step_counter % TARGET_REPLACE_ITER == 0: self.update_target(self.target_net, self.pred_net, 1e-2) b_s, b_a, b_r, b_s_, b_d = self.replay_buffer.sample(BATCH_SIZE) b_w, b_idxes = np.ones_like(b_r), None b_s = torch.FloatTensor(b_s) b_a = torch.LongTensor(b_a) b_s_ = torch.FloatTensor(b_s_) if USE_GPU: b_s, b_a, b_s_ = b_s.cuda(), b_a.cuda(), b_s_.cuda() # action value distribution prediction q_eval = self.pred_net(b_s) # (m, N_ACTIONS, N_ATOM) mb_size = q_eval.size(0) q_eval = torch.stack([ q_eval[i].index_select(0, b_a[i]) for i in range(mb_size) ]).squeeze(1) # (m, N_ATOM) # target distribution q_target = np.zeros((mb_size, N_ATOM)) # (m, N_ATOM) # get next state value q_next = self.target_net(b_s_).detach() # (m, N_ACTIONS, N_ATOM) # next value mean q_next_mean = torch.sum(q_next * self.value_range.view(1, 1, -1), dim=2) # (m, N_ACTIONS) best_actions = q_next_mean.argmax(dim=1) # (m) q_next = torch.stack([ q_next[i].index_select(0, best_actions[i]) for i in range(mb_size) ]).squeeze(1) q_next = q_next.data.cpu().numpy() # (m, N_ATOM) # categorical projection ''' next_v_range : (z_j) i.e. values of possible return, shape : (m, N_ATOM) next_v_pos : relative position when offset of value is V_MIN, shape : (m, N_ATOM) ''' # we vectorized the computation of support and position next_v_range = np.expand_dims(b_r, 1) + GAMMA * np.expand_dims((1. 
- b_d),1) \ * np.expand_dims(self.value_range.data.cpu().numpy(),0) next_v_pos = np.zeros_like(next_v_range) # clip for categorical distribution next_v_range = np.clip(next_v_range, V_MIN, V_MAX) # calc relative position of possible value next_v_pos = (next_v_range - V_MIN) / V_STEP # get lower/upper bound of relative position lb = np.floor(next_v_pos).astype(int) ub = np.ceil(next_v_pos).astype(int) # we didn't vectorize the computation of target assignment. for i in range(mb_size): for j in range(N_ATOM): # calc prob mass of relative position weighted with distance q_target[i, lb[i, j]] += (q_next * (ub - next_v_pos))[i, j] q_target[i, ub[i, j]] += (q_next * (next_v_pos - lb))[i, j] q_target = torch.FloatTensor(q_target) if USE_GPU: q_target = q_target.cuda() # calc huber loss, dont reduce for importance weight loss = q_target * (-torch.log(q_eval + 1e-8)) # (m , N_ATOM) loss = torch.mean(loss) # calc importance weighted loss b_w = torch.Tensor(b_w) if USE_GPU: b_w = b_w.cuda() loss = torch.mean(b_w * loss) # backprop loss self.optimizer.zero_grad() loss.backward() self.optimizer.step()
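# The double loop above implements the categorical (C51) projection: the shifted support
# r + gamma * (1 - done) * z_j is clipped to [V_MIN, V_MAX] and its probability mass is split
# between the two neighbouring atoms. A hedged, self-contained NumPy sketch is shown below;
# the argument names are assumptions, and it additionally handles the exact-atom case
# (lb == ub), which would otherwise drop probability mass.
import numpy as np

def categorical_projection(next_dist, rewards, dones, v_min, v_max, n_atom, gamma):
    # next_dist: (m, n_atom) probabilities of the greedy next action; rewards, dones: (m,)
    m = next_dist.shape[0]
    delta_z = (v_max - v_min) / (n_atom - 1)
    z = np.linspace(v_min, v_max, n_atom)                        # fixed support z_j
    tz = rewards[:, None] + gamma * (1.0 - dones[:, None]) * z   # shifted support (m, n_atom)
    tz = np.clip(tz, v_min, v_max)
    pos = (tz - v_min) / delta_z                                 # fractional atom index
    lb, ub = np.floor(pos).astype(int), np.ceil(pos).astype(int)
    proj = np.zeros((m, n_atom))
    for i in range(m):
        for j in range(n_atom):
            if lb[i, j] == ub[i, j]:
                proj[i, lb[i, j]] += next_dist[i, j]
            else:
                proj[i, lb[i, j]] += next_dist[i, j] * (ub[i, j] - pos[i, j])
                proj[i, ub[i, j]] += next_dist[i, j] * (pos[i, j] - lb[i, j])
    return proj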
class Agent(): """ Initialize Agent, inclduing: DQN Hyperparameters Local and Targat State-Action Policy Networks Replay Memory Buffer from Replay Buffer Class (define below) """ def __init__(self, state_size, action_size, dqn_type='DQN', replay_memory_size=1e5, batch_size=64, gamma=0.99, learning_rate=1e-3, target_tau=2e-3, update_rate=4, seed=0): """ DQN Agent Parameters ====== state_size (int): dimension of each state action_size (int): dimension of each action dqn_type (string): can be either 'DQN' for vanillia dqn learning (default) or 'DDQN' for double-DQN. replay_memory size (int): size of the replay memory buffer (typically 5e4 to 5e6) batch_size (int): size of the memory batch used for model updates (typically 32, 64 or 128) gamma (float): paramete for setting the discoun ted value of future rewards (typically .95 to .995) learning_rate (float): specifies the rate of model learing (typically 1e-4 to 1e-3)) seed (int): random seed for initializing training point. """ self.dqn_type = dqn_type self.state_size = state_size self.action_size = action_size self.buffer_size = int(replay_memory_size) self.batch_size = batch_size self.gamma = gamma self.learn_rate = learning_rate self.tau = target_tau self.update_rate = update_rate self.seed = random.seed(seed) """ # DQN Agent Q-Network # For DQN training, two nerual network models are employed; # (a) A network that is updated every (step % update_rate == 0) # (b) A target network, with weights updated to equal the network at a slower (target_tau) rate. # The slower modulation of the target network weights operates to stablize learning. """ self.network = QNetwork(state_size, action_size, seed).to(device) self.target_network = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.network.parameters(), lr=self.learn_rate, betas=BETAS) # Replay memory self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 ######################################################## # STEP() method # def step(self, state, action, reward, next_state, done, update=True): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % self.update_rate if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > self.batch_size: experiences = self.memory.sample() if update: self.learn(experiences, self.gamma) ######################################################## # ACT() method # def act(self, state, eps=0.0): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.network.eval() with torch.no_grad(): action_values = self.network(state) self.network.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) ######################################################## # LEARN() method # Update value parameters using given batch of experience tuples. 
    def learn(self, experiences, gamma, DQN=True):
        """
        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get Q values from current observations (s, a) using the model network
        Qsa = self.network(states).gather(1, actions)

        if self.dqn_type == 'DDQN':
            # Double DQN
            # ************************
            # select greedy next actions with the online network, evaluate them with the target network
            Qsa_prime_actions = self.network(next_states).detach().max(1)[1].unsqueeze(1)
            Qsa_prime_targets = self.target_network(next_states).detach().gather(1, Qsa_prime_actions)
        else:
            # Regular (Vanilla) DQN
            # ************************
            # Get max Q values for (s', a') from the target model
            Qsa_prime_target_values = self.target_network(next_states).detach()
            Qsa_prime_targets = Qsa_prime_target_values.max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Qsa_targets = rewards + (gamma * Qsa_prime_targets * (1 - dones))

        # Compute loss (error)
        loss = F.mse_loss(Qsa, Qsa_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.network, self.target_network, self.tau)

    ########################################################
    """
    Soft update model parameters.
    θ_target = τ*θ_local + (1 - τ)*θ_target
    """
    def soft_update(self, local_model, target_model, tau):
        """
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def save_the_model(self, iteration, f_name):
        if not os.path.exists('./save/dqn/'):
            os.makedirs('./save/dqn/')
        f_name = 'dqn_param_' + str(iteration) + '_' + f_name + '_model.pth'
        torch.save(self.network.state_dict(), './save/dqn/' + f_name)
        print('DQN Model Saved')

    def load_the_model(self, iteration, f_name):
        f_path = './save/dqn/dqn_param_' + str(iteration) + '_' + f_name + '_model.pth'
        self.network.load_state_dict(torch.load(f_path))
        print('DQN Model Loaded')
class Smoothing_DQN(object):
    def __init__(self):
        self.pred_net_Q1, self.target_net_Q1 = ConvNet(), ConvNet()
        self.pred_net_Q2, self.target_net_Q2 = ConvNet(), ConvNet()
        # sync eval and target networks
        self.target_deque1 = deque(maxlen=n)
        self.target_deque2 = deque(maxlen=n)
        self.update_target(self.target_net_Q1, self.pred_net_Q1, 1.0)
        self.update_target(self.target_net_Q2, self.pred_net_Q2, 1.0)
        self.target_deque1.append(self.target_net_Q1)
        # use gpu
        if USE_GPU:
            self.pred_net_Q1.cuda()
            self.target_net_Q1.cuda()
            self.pred_net_Q2.cuda()
            self.target_net_Q2.cuda()
        # simulator step counter
        self.memory_counter = 0
        # target network step counter
        self.learn_step_counter = 0
        # loss function
        self.loss_function = nn.MSELoss()
        # create the replay buffer
        self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)
        # define optimizers
        self.optimizer = torch.optim.Adam(self.pred_net_Q1.parameters(), lr=LR)
        self.optimizer1 = torch.optim.Adam(self.pred_net_Q2.parameters(), lr=LR)

    def update_target(self, target, pred, update_rate):
        # update target network parameters using the prediction network
        for target_param, pred_param in zip(target.parameters(), pred.parameters()):
            target_param.data.copy_((1.0 - update_rate) * target_param.data
                                    + update_rate * pred_param.data)

    def save_model(self):
        # save prediction networks and target networks
        self.pred_net_Q1.save(PRED_PATH)
        self.target_net_Q1.save(TARGET_PATH)
        self.pred_net_Q2.save(PRED_PATH1)
        self.target_net_Q2.save(TARGET_PATH)

    def load_model(self):
        # load prediction networks and target networks
        self.pred_net_Q1.load(PRED_PATH)
        self.target_net_Q1.load(TARGET_PATH)
        self.pred_net_Q2.load(PRED_PATH)
        self.target_net_Q2.load(TARGET_PATH)

    def choose_action(self, x, EPSILON):
        # x: state
        x = torch.FloatTensor(x)
        if USE_GPU:
            x = x.cuda()
        # epsilon-greedy policy
        if np.random.uniform() >= EPSILON:
            # greedy case
            action_value = self.pred_net_Q1(x)
            action_value += self.pred_net_Q2(x)
            action = torch.argmax(action_value, dim=1).data.cpu().numpy()
        else:
            # random exploration case
            action = np.random.randint(0, N_ACTIONS, (x.size(0)))
        return action

    def store_transition(self, s, a, r, s_, done):
        self.memory_counter += 1
        self.replay_buffer.add(s, a, r, s_, float(done))

    def save_history(self):
        if self.memory_counter % dealy_interval == 0:
            self.target_deque1.append(self.pred_net_Q1)
        if self.memory_counter % dealy_interval + 100 == 0:
            self.target_deque2.append(self.pred_net_Q2)

    # def update_target(self):
    #     # weight=np.array([0.9,0.])
    #     if len(self.target_deque) < n:
    #         for target_param, pred_param in zip(self.target_net.parameters(), self.pred_net.parameters()):
    #             target_param.data.copy_((1.0 - 1e-2) * target_param.data + 1e-2 * pred_param.data)
    #         return
    #     for i, net in enumerate(self.target_deque):
    #         for target_param, queue_net in zip(self.target_net.parameters(), net.parameters()):
    #             target_param.data.copy_(self.weight[i] * queue_net.data)

    def learn(self):
        self.learn_step_counter += 1
        # target parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.update_target(self.target_net_Q1, self.pred_net_Q1, 1e-2)
            self.update_target(self.target_net_Q2, self.pred_net_Q2, 1e-2)

        b_s, b_a, b_r, b_s_, b_d = self.replay_buffer.sample(BATCH_SIZE)
        # b_w, b_idxes = np.ones_like(b_r), None
        b_s = torch.FloatTensor(b_s)
        b_a = torch.LongTensor(b_a)
        b_r = torch.FloatTensor(b_r)
        b_s_ = torch.FloatTensor(b_s_)
        b_d = torch.FloatTensor(b_d)
        if USE_GPU:
            b_s, b_a, b_r, b_s_, b_d = b_s.cuda(), b_a.cuda(), b_r.cuda(), b_s_.cuda(), b_d.cuda()

        # action values for the current state
        q_eval1 = self.pred_net_Q1(b_s)
        mb_size = q_eval1.size(0)
        q_eval1 = torch.stack([q_eval1[i][b_a[i]] for i in range(mb_size)])

        q_eval2 = self.pred_net_Q2(b_s)
        mb_size = q_eval2.size(0)
        q_eval2 = torch.stack([q_eval2[i][b_a[i]] for i in range(mb_size)])

        # next-state value: random convex combination of the current target net
        # and the historical target networks kept in the deque
        alpha = np.random.uniform(0, 1, len(self.target_deque1) + 1)
        alpha = alpha / alpha.sum()
        q_next1 = self.target_net_Q1(b_s_)
        q_next1 = alpha[-1] * torch.max(q_next1, -1)[0]
        for i, target in enumerate(self.target_deque1):
            q_next_history = target(b_s_)
            q_next1 += alpha[i] * torch.max(q_next_history, -1)[0]

        alpha = np.random.uniform(0, 1, len(self.target_deque2) + 1)
        alpha = alpha / alpha.sum()
        q_next2 = self.target_net_Q2(b_s_)
        q_next2 = alpha[-1] * torch.max(q_next2, -1)[0]
        for i, target in enumerate(self.target_deque2):
            q_next_history = target(b_s_)
            q_next2 += alpha[i] * torch.max(q_next_history, -1)[0]

        q_target1 = b_r + GAMMA * (1. - b_d) * q_next1
        q_target1 = q_target1.detach()
        q_target2 = b_r + GAMMA * (1. - b_d) * q_next2
        q_target2 = q_target2.detach()

        # loss: each head is regressed toward the other head's target
        loss = self.loss_function(q_eval1, q_target2)
        logger.store(loss=loss)
        # backprop loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        loss = self.loss_function(q_eval2, q_target1)
        self.optimizer1.zero_grad()
        loss.backward()
        self.optimizer1.step()
        return loss
class DQN(object): def __init__(self): if USE_CNN: if USE_GPU: self.eval_net, self.target_net = ConvNet().cuda(), ConvNet( ).cuda() else: self.eval_net, self.target_net = ConvNet(), ConvNet() else: if USE_GPU: self.eval_net, self.target_net = Net().cuda(), Net().cuda() else: self.eval_net, self.target_net = Net(), Net() self.learn_step_counter = 0 # for target updating self.memory_counter = 0 # Create the replay buffer if MEMORY_MODE == 'PER': self.replay_buffer = PrioritizedReplayBuffer(MEMORY_CAPACITY, alpha=PER_ALPHA) else: self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY) self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=LR) def choose_action(self, x, EPSILON): if USE_GPU: x = Variable(torch.FloatTensor(x)).cuda() else: x = Variable(torch.FloatTensor(x)) # input only one sample if np.random.uniform() < EPSILON: # greedy actions_value = self.eval_net.forward(x.unsqueeze(0)) if USE_GPU: action = torch.argmax( actions_value).data.cpu().numpy() # return the argmax else: action = torch.argmax( actions_value).data.numpy() # return the argmax; else: # random action = np.random.randint(0, N_ACTIONS) return action def store_transition(self, s, a, r, s_, done): self.memory_counter += 1 self.replay_buffer.add(s, a, r, s_, float(done)) def learn(self, beta): # target parameter update if self.learn_step_counter % TARGET_REPLACE_ITER == 0: self.target_net.load_state_dict(self.eval_net.state_dict()) self.learn_step_counter += 1 # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if MEMORY_MODE == 'PER': experience = self.replay_buffer.sample(BATCH_SIZE, beta=beta) (b_state_memory, b_action_memory, b_reward_memory, b_next_state_memory, b_done, b_weights, b_idxes) = experience else: b_state_memory, b_action_memory, b_reward_memory, b_next_state_memory, b_done = self.replay_buffer.sample( BATCH_SIZE) b_weights, b_idxes = np.ones_like(b_reward_memory), None if USE_GPU: b_s = Variable(torch.FloatTensor(b_state_memory)).cuda() b_a = Variable(torch.LongTensor(b_action_memory)).cuda() b_r = Variable(torch.FloatTensor(b_reward_memory)).cuda() b_s_ = Variable(torch.FloatTensor(b_next_state_memory)).cuda() b_d = Variable(torch.FloatTensor(b_done)).cuda() else: b_s = Variable(torch.FloatTensor(b_state_memory)) b_a = Variable(torch.LongTensor(b_action_memory)) b_r = Variable(torch.FloatTensor(b_reward_memory)) b_s_ = Variable(torch.FloatTensor(b_next_state_memory)) b_d = Variable(torch.FloatTensor(b_done)) # q_eval w.r.t the action in experience q_eval = self.eval_net(b_s).gather(1, b_a.unsqueeze(1)).view( -1) # shape (batch, 1) if DOUBLE: _, best_actions = self.eval_net.forward(b_s_).detach().max(1) q_next = self.target_net( b_s_).detach() # detach from graph, don't backpropagate q_target = b_r + GAMMA * (1. - b_d) * q_next.gather( 1, best_actions.unsqueeze(1)).squeeze(1) # shape (batch, 1) else: q_next = self.target_net( b_s_).detach() # detach from graph, don't backpropagate q_target = b_r + GAMMA * ( 1. - b_d) * q_next.max(1)[0] # shape (batch, 1) loss = F.smooth_l1_loss(q_eval, q_target, reduce=False) loss = torch.mean(torch.Tensor(b_weights).cuda() * loss) td_error = (q_target - q_eval).data.cpu().numpy() self.optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(self.eval_net.parameters(), 10.) 
self.optimizer.step() if MEMORY_MODE == 'PER': new_priorities = np.abs(td_error) + PER_EPSILON self.replay_buffer.update_priorities(b_idxes, new_priorities) def save_model(self): # save evaluation network and target network simultaneously self.eval_net.save(EVAL_PATH) self.target_net.save(TARGET_PATH) def load_model(self): # load evaluation network and target network simultaneously self.eval_net.load(EVAL_PATH) self.target_net.load(TARGET_PATH)
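# When MEMORY_MODE == 'PER', the agent above re-prioritises the sampled transitions with
# new_priorities = |td_error| + PER_EPSILON and scales each sample's loss by an importance
# weight supplied by the buffer. A minimal sketch of those two ingredients follows; the
# exponents alpha/beta and the helper names are assumptions for illustration only.
import numpy as np

def per_priority(td_error, eps=1e-6):
    # priority p_i = |delta_i| + eps; transitions are later sampled with probability
    # p_i^alpha / sum_k p_k^alpha
    return np.abs(td_error) + eps

def per_importance_weight(sample_prob, buffer_size, beta=0.4):
    # w_i = (N * P(i))^(-beta), usually normalised by max_i w_i before weighting the loss
    return (buffer_size * sample_prob) ** (-beta)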
class Agent(): def __init__(self, state_size, action_size, dqn_type='DQN', replay_memory_size=1e5, batch_size=64, gamma=0.99, learning_rate=1e-3, target_tau=2e-3, update_rate=4, seed=0): self.dqn_type = dqn_type self.state_size = state_size self.action_size = action_size self.buffer_size = int(replay_memory_size) self.batch_size = batch_size self.gamma = gamma self.learn_rate = learning_rate self.tau = target_tau self.update_rate = update_rate self.seed = random.seed(seed) """ # DQN Agent Q-Network # For DQN training, two neural network models are employed; # (a) A network that is updated every (step % update_rate == 0) # (b) A target network, with weights updated to equal the network at a slower (target_tau) rate. # The slower modulation of the target network weights operates to stablize learning. """ self.network = QNetwork(state_size, action_size, seed).to(device) self.target_network = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.network.parameters(), lr=self.learn_rate) # Replay memory self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, actions, rewards, next_state, dones): # Save experience in replay memory for i in range(len(actions)): # print("Step ACTIONS", actions, actions[i], state[i]) self.memory.add(state[i], actions[i], rewards[i], next_state[i], dones[i]) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % self.update_rate if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences, self.gamma) def act(self, state, eps=0.0): """Returns actions for given state as per current policy. state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.network.eval() with torch.no_grad(): action_values = self.network(state) self.network.train() num_agents = len(action_values[0]) # print("AGENT ACT VALUES", action_values, np.argmax(action_values.cpu().data.numpy()[0], 1), np.array([random.choice(np.arange(self.action_size)) for i in range(num_agents)])) # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()[0], 1) else: return np.array( np.array([ random.choice(np.arange(self.action_size)) for i in range(num_agents) ])) # Update value parameters using given batch of experience tuples. 
    def learn(self, experiences, gamma, DQN=True):
        """
        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get Q values from current observations (s, a) using the model network
        Qsa = self.network(states).gather(1, actions)

        if self.dqn_type == 'DDQN':
            # Double DQN
            # ************************
            # select greedy next actions with the online network, evaluate them with the target network
            Qsa_prime_actions = self.network(next_states).detach().max(1)[1].unsqueeze(1)
            Qsa_prime_targets = self.target_network(next_states).detach().gather(1, Qsa_prime_actions)
        else:
            # Regular (Vanilla) DQN
            # ************************
            # Get max Q values for (s', a') from the target model
            Qsa_prime_target_values = self.target_network(next_states).detach()
            Qsa_prime_targets = Qsa_prime_target_values.max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Qsa_targets = rewards + (gamma * Qsa_prime_targets * (1 - dones))

        # Compute loss (error)
        loss = F.mse_loss(Qsa, Qsa_targets)
        # print(Qsa, Qsa_targets)
        # print(loss)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.network, self.target_network, self.tau)

    """
    Soft update model parameters.
    θ_target = τ*θ_local + (1 - τ)*θ_target
    """
    def soft_update(self, local_model, target_model, tau):
        """
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class QR_DQN(object): def __init__(self): self.pred_net, self.target_net = ConvNet(), ConvNet() # sync eval target self.update_target(self.target_net, self.pred_net, 1.0) # use gpu if USE_GPU: self.pred_net.cuda() self.target_net.cuda() # simulator step conter self.memory_counter = 0 # target network step counter self.learn_step_counter = 0 # ceate the replay buffer self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY) # define optimizer self.optimizer = torch.optim.Adam(self.pred_net.parameters(), lr=LR) def update_target(self, target, pred, update_rate): # update target network parameters using predcition network for target_param, pred_param in zip(target.parameters(), pred.parameters()): target_param.data.copy_((1.0 - update_rate) \ * target_param.data + update_rate*pred_param.data) def save_model(self): # save prediction network and target network self.pred_net.save(PRED_PATH) self.target_net.save(TARGET_PATH) def load_model(self): # load prediction network and target network self.pred_net.load(PRED_PATH) self.target_net.load(TARGET_PATH) def choose_action(self, x, EPSILON): x = torch.FloatTensor(x) if USE_GPU: x = x.cuda() if np.random.uniform() >= EPSILON: # greedy case action_value = self.pred_net(x).mean(dim=2) # (N_ENVS, N_ACTIONS) action = torch.argmax(action_value, dim=1).data.cpu().numpy() else: # random exploration case action = np.random.randint(0, N_ACTIONS, (x.size(0))) return action def store_transition(self, s, a, r, s_, done): self.memory_counter += 1 self.replay_buffer.add(s, a, r, s_, float(done)) def learn(self): self.learn_step_counter += 1 # target parameter update if self.learn_step_counter % TARGET_REPLACE_ITER == 0: self.update_target(self.target_net, self.pred_net, 1e-2) b_s, b_a, b_r,b_s_, b_d = self.replay_buffer.sample(BATCH_SIZE) b_w, b_idxes = np.ones_like(b_r), None b_s = torch.FloatTensor(b_s) b_a = torch.LongTensor(b_a) b_r = torch.FloatTensor(b_r) b_s_ = torch.FloatTensor(b_s_) b_d = torch.FloatTensor(b_d) if USE_GPU: b_s, b_a, b_r, b_s_, b_d = b_s.cuda(), b_a.cuda(), b_r.cuda(), b_s_.cuda(), b_d.cuda() # action value distribution prediction q_eval = self.pred_net(b_s) # (m, N_ACTIONS, N_QUANT) mb_size = q_eval.size(0) q_eval = torch.stack([q_eval[i].index_select(0, b_a[i]) for i in range(mb_size)]).squeeze(1) # (m, N_QUANT) q_eval = q_eval.unsqueeze(2) # (m, N_QUANT, 1) # note that dim 1 is for present quantile, dim 2 is for next quantile # get next state value q_next = self.target_net(b_s_).detach() # (m, N_ACTIONS, N_QUANT) best_actions = q_next.mean(dim=2).argmax(dim=1) # (m) q_next = torch.stack([q_next[i].index_select(0, best_actions[i]) for i in range(mb_size)]).squeeze(1) # (m, N_QUANT) q_target = b_r.unsqueeze(1) + GAMMA * (1. 
-b_d.unsqueeze(1)) * q_next # (m, N_QUANT) q_target = q_target.unsqueeze(1) # (m , 1, N_QUANT) # quantile Huber loss u = q_target.detach() - q_eval # (m, N_QUANT, N_QUANT) tau = torch.FloatTensor(QUANTS_TARGET).view(1, -1, 1) # (1, N_QUANT, 1) # note that tau is for present quantile if USE_GPU: tau = tau.cuda() weight = torch.abs(tau - u.le(0.).float()) # (m, N_QUANT, N_QUANT) loss = F.smooth_l1_loss(q_eval, q_target.detach(), reduction='none') # (m, N_QUANT, N_QUANT) loss = torch.mean(weight * loss, dim=1).mean(dim=1) print('1',loss.shape) # calc importance weighted loss b_w = torch.Tensor(b_w) if USE_GPU: b_w = b_w.cuda() # loos = b_w * loss print('2',(b_w * loss).shape) loss = torch.mean(b_w * loss) # backprop loss self.optimizer.zero_grad() loss.backward() # torch.nn.utils.clip_grad_norm_(self.pred_net.parameters(),0.1) self.optimizer.step()
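# QUANTS_TARGET above holds the fixed quantile fractions used by the QR-DQN loss. They are
# commonly taken as the quantile midpoints tau_hat_i = (2i + 1) / (2N); the sketch below shows
# that convention, which is an assumption about how QUANTS_TARGET is built elsewhere, not a
# fact taken from this code base.
import numpy as np

N_QUANT_EXAMPLE = 4
quants = np.arange(N_QUANT_EXAMPLE)                              # i = 0 .. N-1
quants_target = (2.0 * quants + 1.0) / (2.0 * N_QUANT_EXAMPLE)   # [0.125, 0.375, 0.625, 0.875]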
class Agent(): '''Interact with and learn from environment.''' def __init__(self, state_size, action_size, seed): self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.t_step = 0 # counter for activating learning every few steps self.running_c_loss = 0 self.running_a_loss = 0 self.training_cnt = 0 # Actor network (w/ target network) self.actor_local = Actor(state_size, action_size, seed).to(device) self.actor_target = Actor(state_size, action_size, seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic network (w/ target network) self.critic_local = Critic(state_size, action_size, seed).to(device) self.critic_target = Critic(state_size, action_size, seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) def act(self, state, mode): '''Returns actions for given state as per current policy. Params ====== state (array): current state mode (string): train or test epsilon (float): for epsilon-greedy action selection ''' state = torch.from_numpy(state).unsqueeze(0).float().to( device) # shape of state (1, state_size) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if mode == 'test': return np.clip(action, -1, 1) elif mode == 'train': # if train, then add OUNoise in action action += self.noise.sample() return np.clip(action, -1, 1) def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) # activate learning every few steps self.t_step = self.t_step + 1 if self.t_step % LEARN_EVERY_STEP == 0: # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: for _ in range(10): # update 10 times per learning experiences = self.memory.sample() self.learn(experiences, GAMMA) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) self.running_c_loss += float(critic_loss.cpu().data.numpy()) self.training_cnt += 1 # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() self.running_a_loss += float(actor_loss.cpu().data.numpy()) # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() #torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1) # clip gradient to max 1 self.actor_optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
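# The critic update above bootstraps from the target actor and target critic:
# y = r + gamma * Q'(s', mu'(s')) * (1 - done). A minimal sketch of that target, written as a
# standalone function with illustrative (assumed) argument names, is shown below.
import torch

def ddpg_critic_target(rewards, dones, next_states, actor_target, critic_target, gamma=0.99):
    with torch.no_grad():
        next_actions = actor_target(next_states)             # mu'(s')
        q_next = critic_target(next_states, next_actions)    # Q'(s', mu'(s'))
        return rewards + gamma * q_next * (1.0 - dones)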
class DQNAgent(): def __init__(self, input_shape, action_size, buffer_size, batch_size, gamma, lr, tau, update_every, device): """Initialize an Agent object. Params ====== input_shape (tuple): dimension of each state action_size (int): dimension of each action buffer_size (int): replay buffer size batch_size (int): minibatch size gamma (float): discount factor lr (float): learning rate tau (float): Soft-parameter update update_every (int): how often to update the network device(string): Use Gpu or CPU """ self.input_shape = input_shape self.action_size = action_size self.buffer_size = buffer_size self.batch_size = batch_size self.gamma = gamma self.lr = lr self.update_every = update_every self.tau = tau self.device = device # Q-Network self.policy_net = DQNLinear(input_shape, action_size).to(self.device) self.target_net = DQNLinear(input_shape, action_size).to(self.device) self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr) # Replay memory self.memory = ReplayBuffer(self.buffer_size, self.batch_size, self.device) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % self.update_every if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) def act(self, state, eps=0.01): state = torch.from_numpy(state).unsqueeze(0).to(self.device) self.policy_net.eval() with torch.no_grad(): action_values = self.policy_net(state) self.policy_net.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences): states, actions, rewards, next_states, dones = experiences # Get expected Q values from policy model Q_expected_current = self.policy_net(states) Q_expected = Q_expected_current.gather(1, actions.unsqueeze(1)).squeeze(1) # Get max predicted Q values (for next states) from target model Q_targets_next = self.target_net(next_states).detach().max(1)[0] # Compute Q targets for current states Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() self.soft_update(self.policy_net, self.target_net, self.tau) # θ'=θ×τ+θ'×(1−τ) def soft_update(self, policy_model, target_model, tau): for target_param, policy_param in zip(target_model.parameters(), policy_model.parameters()): target_param.data.copy_(tau * policy_param.data + (1.0 - tau) * target_param.data) def load_model(self, path): checkpoint = torch.load(path) self.policy_net.load_state_dict(checkpoint['state_dict']) self.target_net.load_state_dict(checkpoint['state_dict']) self.optimizer.load_state_dict(checkpoint['optimizer']) scores = checkpoint['scores'] return scores def save_model(self, path, scores): model = { "state_dict": self.policy_net.state_dict(), "optimizer": self.optimizer.state_dict(), "scores": scores } torch.save(model, path)
class AgentD4PG(): """ Agent implementing noisy agent """ def __init__(self, state_size, action_size, seed, device=device, epsilon=0.3): self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.device = device self.epsilon = epsilon self.t_step = 0 # counter for activating learning every few steps self.running_c_loss = 0 self.running_a_loss = 0 self.training_cnt = 0 # Actor network (w/ target network) self.actor_local = DDPGActor(state_size, action_size, seed).to(device) self.actor_target = DDPGActor(state_size, action_size, seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic network (w/ target network) self.critic_local = D4PGCritic(state_size, action_size, seed, N_ATOMS, Vmin, Vmax).to(device) self.critic_target = D4PGCritic(state_size, action_size, seed, N_ATOMS, Vmin, Vmax).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) def act(self, states, mode): states_v = torch.Tensor(np.array(states, dtype=np.float32)).to(self.device) self.actor_local.eval() with torch.no_grad(): mu_v = self.actor_local(states_v) actions = mu_v.data.cpu().numpy() self.actor_local.train() if mode == "test": return np.clip(actions, -1, 1) elif mode == "train": actions += self.epsilon * np.random.normal(size=actions.shape) actions = np.clip(actions, -1, 1) return actions def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) # activate learning every few steps self.t_step = self.t_step + 1 if self.t_step % LEARN_EVERY_STEP == 0: # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: for _ in range(10): # update 10 times per learning experiences = self.memory.sample2() self.learn(experiences, GAMMA) def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # crt_distr_v = self.critic_local(states, actions) last_act_v = self.actor_target(next_states) last_distr_v = F.softmax(self.critic_target(next_states, last_act_v), dim=1) proj_distr_v = distr_projection(last_distr_v, rewards, dones, gamma=gamma**REWARD_STEPS, device=device) prob_dist_v = -F.log_softmax(crt_distr_v, dim=1) * proj_distr_v critic_loss_v = prob_dist_v.sum(dim=1).mean() self.running_c_loss += float(critic_loss_v.cpu().data.numpy()) self.training_cnt += 1 # Minimize the loss self.critic_optimizer.zero_grad() critic_loss_v.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) crt_distr_v = self.critic_local(states, actions_pred) actor_loss_v = -self.critic_local.distr_to_q(crt_distr_v) actor_loss_v = actor_loss_v.mean() self.running_a_loss += float(actor_loss_v.cpu().data.numpy()) # Minimize the loss self.actor_optimizer.zero_grad() actor_loss_v.backward() # torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1) # clip gradient to max 1 self.actor_optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)