class Agent():
    """DQN agent supporting vanilla DQN and Double DQN.

    The agent bundles:
      * the DQN hyperparameters,
      * a local and a target state-action network (``QNetwork``),
      * a replay memory buffer (``ReplayBuffer``).
    """

    def __init__(self,
                 state_size,
                 action_size,
                 dqn_type='DQN',
                 replay_memory_size=1e5,
                 batch_size=64,
                 gamma=0.99,
                 learning_rate=1e-3,
                 target_tau=2e-3,
                 update_rate=4,
                 seed=0):
        """Set up hyperparameters, networks, optimizer and replay memory.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            dqn_type (string): 'DQN' for vanilla DQN learning (default) or
                'DDQN' for double-DQN
            replay_memory_size (int): size of the replay memory buffer
                (typically 5e4 to 5e6)
            batch_size (int): size of the memory batch used for model updates
                (typically 32, 64 or 128)
            gamma (float): discount applied to future rewards
                (typically .95 to .995)
            learning_rate (float): rate of model learning
                (typically 1e-4 to 1e-3)
            target_tau (float): interpolation factor of the soft target update
            update_rate (int): a learning step is attempted every
                ``update_rate`` calls to ``step``
            seed (int): random seed for initializing training point
        """
        self.dqn_type = dqn_type
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = int(replay_memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.learn_rate = learning_rate
        self.tau = target_tau
        self.update_rate = update_rate
        # random.seed() returns None, so seed first and keep the value itself.
        random.seed(seed)
        self.seed = seed

        # DQN Agent Q-Network
        # For DQN training, two neural network models are employed:
        # (a) a network that is updated every (step % update_rate == 0)
        # (b) a target network, with weights moved towards the network at a
        #     slower (target_tau) rate.
        # The slower modulation of the target network weights operates to
        # stabilize learning.
        self.network = QNetwork(state_size, action_size, seed).to(device)
        self.target_network = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.optimizer = optim.Adam(self.network.parameters(),
                                    lr=self.learn_rate,
                                    betas=BETAS)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed)

        # Time step counter (for updating every `update_rate` steps)
        self.t_step = 0

    ########################################################
    # STEP() method
    #
    def step(self, state, action, reward, next_state, done, update=True):
        """Store one transition and, every ``update_rate`` calls, learn.

        Params
        ======
            state, action, reward, next_state, done: the transition that is
                handed to ``self.memory.add``
            update (bool): when False a batch is still sampled but no
                learning step is performed
        """
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every `update_rate` time steps.
        self.t_step = (self.t_step + 1) % self.update_rate
        if self.t_step != 0:
            return

        # If enough samples are available in memory, get a random subset
        # and learn from it.
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            if update:
                self.learn(experiences, self.gamma)

    ########################################################
    # ACT() method
    #
    def act(self, state, eps=0.0):
        """Return an action for the given state as per the current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.network.eval()
        with torch.no_grad():
            action_values = self.network(state)
        self.network.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        return random.choice(np.arange(self.action_size))

    ########################################################
    # LEARN() method
    # Update value parameters using given batch of experience tuples.
    def learn(self, experiences, gamma, DQN=True):
        """Run one gradient step on a batch and soft-update the target net.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of
                (s, a, r, s', done) batches
            gamma (float): discount factor
            DQN (bool): unused; kept so existing callers keep working
        """
        states, actions, rewards, next_states, dones = experiences

        # Q values of the taken actions (s, a) from the local network
        Qsa = self.network(states).gather(1, actions)

        # The bootstrap target is a constant with respect to the loss, so it
        # is computed without tracking gradients.
        with torch.no_grad():
            if self.dqn_type == 'DDQN':
                # Double DQN: the local network picks the action, the
                # target network evaluates it.
                Qsa_prime_actions = self.network(next_states).max(
                    1)[1].unsqueeze(1)
                Qsa_prime_targets = self.target_network(next_states).gather(
                    1, Qsa_prime_actions)
            else:
                # Regular (vanilla) DQN: max Q value for (s', a') from the
                # target network.
                Qsa_prime_target_values = self.target_network(next_states)
                Qsa_prime_targets = Qsa_prime_target_values.max(
                    1)[0].unsqueeze(1)

            # Q targets for the current states
            Qsa_targets = rewards + (gamma * Qsa_prime_targets * (1 - dones))

        # Compute loss (error)
        loss = F.mse_loss(Qsa, Qsa_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.network, self.target_network, self.tau)

    ########################################################
    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save_the_model(self, iteration, f_name):
        """Write the local network's state dict below ``./save/dqn/``."""
        os.makedirs('./save/dqn/', exist_ok=True)
        f_name = 'dqn_param_' + str(iteration) + '_' + f_name + '_model.pth'
        torch.save(self.network.state_dict(), './save/dqn/' + f_name)
        print('DQN Model Saved')

    def load_the_model(self, iteration, f_name):
        """Load the local network's state dict saved by ``save_the_model``."""
        f_path = './save/dqn/dqn_param_' + str(
            iteration) + '_' + f_name + '_model.pth'
        # map_location lets a checkpoint written on another device be loaded
        # onto the device this module is configured for.
        self.network.load_state_dict(torch.load(f_path, map_location=device))
        print('DQN Model Loaded')
class DQN(object):
    """DQN variant whose networks return per-action quantile values.

    ``ConvNet`` is called as ``values, tau = net(x)``; the shape remarks in
    the comments below are carried over from the original annotations and
    are not verified here.
    """

    def __init__(self):
        """Create prediction/target networks, replay buffer and optimizer."""
        self.pred_net, self.target_net = ConvNet(), ConvNet()
        # sync the target network with the prediction network
        self.update_target(self.target_net, self.pred_net, 1.0)
        # use gpu
        if USE_GPU:
            self.pred_net.cuda()
            self.target_net.cuda()

        # simulator step counter
        self.memory_counter = 0
        # target network step counter
        self.learn_step_counter = 0

        # create the replay buffer
        self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)

        # define optimizer
        self.optimizer = torch.optim.Adam(self.pred_net.parameters(), lr=LR)

    def update_target(self, target, pred, update_rate):
        """Move ``target`` towards ``pred``.

        target = (1 - update_rate) * target + update_rate * pred
        """
        for target_param, pred_param in zip(target.parameters(),
                                            pred.parameters()):
            target_param.data.copy_((1.0 - update_rate) * target_param.data +
                                    update_rate * pred_param.data)

    def save_model(self):
        """Save prediction network and target network."""
        self.pred_net.save(PRED_PATH)
        self.target_net.save(TARGET_PATH)

    def load_model(self):
        """Load prediction network and target network."""
        self.pred_net.load(PRED_PATH)
        self.target_net.load(TARGET_PATH)

    def choose_action(self, x, EPSILON):
        """Pick epsilon-greedy actions for a batch of states ``x``.

        With probability ``EPSILON`` a random action per row of ``x`` is
        drawn; otherwise the action with the largest mean over the last
        axis of the network output is returned (as a numpy array).
        """
        x = torch.FloatTensor(x)
        if USE_GPU:
            x = x.cuda()

        # epsilon-greedy
        if np.random.uniform() >= EPSILON:
            # greedy case; no gradients are needed for acting
            with torch.no_grad():
                # (N_ENVS, N_ACTIONS, N_QUANT)
                action_value, _ = self.pred_net(x)
                action_value = action_value.mean(dim=2)
                action = torch.argmax(action_value,
                                      dim=1).data.cpu().numpy()
        else:
            # random exploration case
            action = np.random.randint(0, N_ACTIONS, (x.size(0)))
        return action

    def store_transition(self, s, a, r, s_, done):
        """Add one transition to the replay buffer."""
        self.memory_counter += 1
        self.replay_buffer.add(s, a, r, s_, float(done))

    def learn(self):
        """Sample a batch, take one quantile-Huber gradient step.

        Returns the scalar loss tensor of this step.
        """
        self.learn_step_counter += 1
        # target parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.update_target(self.target_net, self.pred_net, 1e-2)

        b_s, b_a, b_r, b_s_, b_d = self.replay_buffer.sample(BATCH_SIZE)

        b_s = torch.FloatTensor(b_s)
        b_a = torch.LongTensor(b_a)
        b_r = torch.FloatTensor(b_r)
        b_s_ = torch.FloatTensor(b_s_)
        b_d = torch.FloatTensor(b_d)

        if USE_GPU:
            b_s, b_a, b_r, b_s_, b_d = (b_s.cuda(), b_a.cuda(), b_r.cuda(),
                                        b_s_.cuda(), b_d.cuda())

        # Uniform importance weights (placeholder for prioritized replay).
        b_w = torch.ones_like(b_r)

        # action value distribution prediction
        # (m, N_ACTIONS, N_QUANT), (N_QUANT, 1)
        q_eval, q_eval_tau = self.pred_net(b_s)
        mb_size = q_eval.size(0)
        batch_index = torch.arange(mb_size, device=q_eval.device)

        # Select, for every sample, the quantiles of the action taken.
        # One indexing op replaces the per-sample index_select/stack loop.
        q_eval = q_eval[batch_index, b_a.reshape(-1)]  # (m, N_QUANT)
        # append an axis after the quantile axis
        q_eval = q_eval.unsqueeze(2)  # (m, N_QUANT, 1)
        # note that dim 1 is for present quantile, dim 2 is for next quantile

        # The target is a constant for the loss: build it without gradients.
        with torch.no_grad():
            # (m, N_ACTIONS, N_QUANT), (N_QUANT, 1)
            q_next, _ = self.target_net(b_s_)
            best_actions = q_next.mean(dim=2).argmax(dim=1)  # (m)
            q_next = q_next[batch_index, best_actions]  # (m, N_QUANT)
            # q_target = R + gamma * (1 - terminate) * q_next
            q_target = b_r.unsqueeze(1) + GAMMA * (
                1. - b_d.unsqueeze(1)) * q_next  # (m, N_QUANT)
            q_target = q_target.unsqueeze(1)  # (m, 1, N_QUANT)

        # quantile Huber loss
        u = q_target - q_eval  # (m, N_QUANT, N_QUANT)
        tau = q_eval_tau.unsqueeze(0)  # (1, N_QUANT, 1)
        # note that tau is for present quantile
        # w = |tau - delta(u<0)|
        weight = torch.abs(tau - u.le(0.).float())  # (m, N_QUANT, N_QUANT)
        # Broadcast explicitly; smooth_l1_loss would do the same internally
        # but warns about mismatching input/target sizes.
        q_eval_b, q_target_b = torch.broadcast_tensors(q_eval, q_target)
        loss = F.smooth_l1_loss(q_eval_b, q_target_b, reduction='none')
        # (m, N_QUANT, N_QUANT)
        loss = torch.mean(weight * loss, dim=1).mean(dim=1)

        # calculate importance weighted loss
        loss = torch.mean(b_w * loss)

        # backprop loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss
class Agent:
    """Policy-gradient agent with a clipped probability-ratio objective.

    Keeps a trainable policy, a copy holding the previous policy weights,
    one ``Episode`` accumulator per handle and a shared replay memory.
    """

    def __init__(self, state_size, action_size, num_agents):
        """Build the networks, optimizer, per-agent episodes and memory."""
        self.policy = PolicyNetwork(state_size, action_size).to(device)
        self.old_policy = PolicyNetwork(state_size, action_size).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=LR)

        self.episodes = [Episode() for _ in range(num_agents)]
        self.memory = ReplayBuffer(BUFFER_SIZE)
        self.t_step = 0
        # Defined here as well so that step() works before the first reset().
        self.finished = [False] * num_agents

    def reset(self):
        """Mark every agent handle as not finished."""
        self.finished = [False] * len(self.episodes)

    # Decide on an action to take in the environment
    def act(self, state, eps=None):
        """Sample an action from the policy output for ``state``.

        ``eps`` is accepted for interface compatibility and is not used.
        """
        self.policy.eval()
        with torch.no_grad():
            batch = torch.from_numpy(state).float().unsqueeze(0).to(device)
            output = self.policy(batch)
            return Categorical(output).sample().item()

    # Record the results of the agent's action and update the model
    def step(self, handle, state, action, next_state, agent_done,
             episode_done, collision):
        """Record one transition for ``handle`` and train periodically.

        The reward is 1 when the agent is done, -0.5 on collision and 0
        otherwise. Handles already marked finished are ignored.
        """
        if not self.finished[handle]:
            if agent_done:
                reward = 1
            elif collision:
                reward = -.5
            else:
                reward = 0

            # Push experience into Episode memory
            self.episodes[handle].push(state, action, reward, next_state,
                                       agent_done or episode_done)

            # When we finish the episode, discount rewards and push the
            # experience into replay memory
            if agent_done or episode_done:
                self.episodes[handle].discount_rewards(GAMMA)
                self.memory.push_episode(self.episodes[handle])
                self.episodes[handle].reset()
                self.finished[handle] = True

        # Perform a gradient update every UPDATE_EVERY time steps
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE * 4:
            self.train(*self.memory.sample(BATCH_SIZE, device))

    def train(self, states, actions, rewards, next_state, done):
        """Take one gradient step on the clipped-ratio surrogate loss.

        ``next_state`` and ``done`` are part of the sampled tuple but are
        not used by the loss.
        """
        self.policy.train()

        responsible_outputs = torch.gather(self.policy(states), 1, actions)
        # The old policy only provides constants for the ratio.
        with torch.no_grad():
            old_responsible_outputs = torch.gather(self.old_policy(states),
                                                   1, actions)

        # rewards = rewards - rewards.mean()
        ratio = responsible_outputs / (old_responsible_outputs + 1e-5)
        clamped_ratio = torch.clamp(ratio, 1. - CLIP_FACTOR,
                                    1. + CLIP_FACTOR)
        loss = -torch.min(ratio * rewards, clamped_ratio * rewards).mean()

        # Remember the pre-update weights, then perform a gradient step
        self.old_policy.load_state_dict(self.policy.state_dict())
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    # Checkpointing methods
    def save(self, path, *data):
        """Write policy, optimizer and pickled ``data`` below ``path/ppo``.

        ``path`` must support the ``/`` operator (e.g. ``pathlib.Path``).
        The ``ppo`` directory is created when missing.
        """
        (path / 'ppo').mkdir(parents=True, exist_ok=True)
        torch.save(self.policy.state_dict(),
                   path / 'ppo/model_checkpoint.policy')
        torch.save(self.optimizer.state_dict(),
                   path / 'ppo/model_checkpoint.optimizer')
        with open(path / 'ppo/model_checkpoint.meta', 'wb') as file:
            pickle.dump(data, file)

    def load(self, path, *defaults):
        """Restore a checkpoint written by ``save``.

        Returns the pickled metadata tuple on success, otherwise
        ``defaults``. Loading stays best-effort: failures are reported on
        stdout instead of raised, but KeyboardInterrupt/SystemExit are no
        longer swallowed.
        """
        try:
            print("Loading model from checkpoint...")
            # Read everything first so a missing file cannot leave the
            # agent with only part of the checkpoint applied.
            policy_state = torch.load(path / 'ppo/model_checkpoint.policy')
            optimizer_state = torch.load(
                path / 'ppo/model_checkpoint.optimizer')
            with open(path / 'ppo/model_checkpoint.meta', 'rb') as file:
                # NOTE: pickle can execute arbitrary code; only load
                # checkpoints from trusted sources.
                meta = pickle.load(file)
            self.policy.load_state_dict(policy_state)
            self.optimizer.load_state_dict(optimizer_state)
            return meta
        except FileNotFoundError:
            print("No checkpoint file was found")
        except Exception as err:
            print(f"Checkpoint could not be loaded: {err!r}")
        return defaults
class Agent:
    """DQN agent (optionally double DQN) shared by several agent handles."""

    def __init__(self, state_size, action_size, num_agents, double_dqn=False):
        """Build local/target Q-networks, optimizer, scheduler and memory."""
        self.action_size = action_size
        self.double_dqn = double_dqn

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size).to(device)
        self.qnetwork_target = copy.deepcopy(self.qnetwork_local)
        self.optimizer = torch.optim.Adam(self.qnetwork_local.parameters(),
                                          lr=LR)
        self.lr_scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer,
                                                            step_size=4000,
                                                            gamma=0.98,
                                                            last_epoch=-1)

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE)
        self.num_agents = num_agents
        self.t_step = 0
        # Defined here as well so that step() works before the first reset().
        self.finished = [False] * num_agents

    def reset(self):
        """Mark every agent handle as not finished."""
        self.finished = [False] * self.num_agents

    # Decide on an action to take in the environment
    def act(self, state, eps=0.):
        """Return an epsilon-greedy action for ``state``."""
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)

        # Epsilon-greedy action selection
        if random.random() > eps:
            return torch.argmax(action_values).item()
        return torch.randint(self.action_size, ()).item()

    # Record the results of the agent's action and update the model
    def step(self, handle, state, action, reward, next_state, agent_done):
        """Store the transition for ``handle`` and learn periodically.

        Transitions of handles already marked finished are not stored.
        """
        if not self.finished[handle]:
            # Save experience in replay memory
            self.memory.push(state, action, reward, next_state, agent_done)
            self.finished[handle] = agent_done

        # Perform a gradient update every UPDATE_EVERY time steps
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE * 1:  # 320
            self.learn(*self.memory.sample(BATCH_SIZE, device))

    def learn(self, states, actions, rewards, next_states, dones):
        """Take one TD gradient step and soft-update the target network."""
        self.qnetwork_local.train()

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # The bootstrap target is a constant for the loss. Building it
        # without gradients keeps both branches consistent and stops
        # gradients from accumulating on the target network's parameters.
        with torch.no_grad():
            if self.double_dqn:
                # Local network selects, target network evaluates.
                Q_best_action = self.qnetwork_local(next_states).argmax(1)
                Q_targets_next = self.qnetwork_target(next_states).gather(
                    1, Q_best_action.unsqueeze(-1))
            else:
                Q_targets_next = self.qnetwork_target(next_states).max(
                    1)[0].unsqueeze(-1)

            # Compute Q targets for current states
            Q_targets = rewards + GAMMA * Q_targets_next * (1 - dones)

        # Compute loss and perform a gradient step
        self.optimizer.zero_grad()
        loss = F.mse_loss(Q_expected, Q_targets)
        loss.backward()
        self.optimizer.step()
        self.lr_scheduler.step()

        # Update the target network parameters to
        # `tau * local.parameters() + (1 - tau) * target.parameters()`
        for target_param, local_param in zip(
                self.qnetwork_target.parameters(),
                self.qnetwork_local.parameters()):
            target_param.data.copy_(TAU * local_param.data +
                                    (1.0 - TAU) * target_param.data)

    # Checkpointing methods
    def save(self, path, *data):
        """Write both networks, the optimizer and pickled ``data`` to ``path``.

        ``path`` must support the ``/`` operator (e.g. ``pathlib.Path``).
        The directory is created when missing.
        """
        path.mkdir(parents=True, exist_ok=True)
        torch.save(self.qnetwork_local.state_dict(),
                   path / 'model_checkpoint.local')
        torch.save(self.qnetwork_target.state_dict(),
                   path / 'model_checkpoint.target')
        torch.save(self.optimizer.state_dict(),
                   path / 'model_checkpoint.optimizer')
        with open(path / 'model_checkpoint.meta', 'wb') as file:
            pickle.dump(data, file)

    def load(self, path, *defaults):
        """Restore a checkpoint written by ``save``.

        Returns the pickled metadata tuple on success, otherwise
        ``defaults``. Loading stays best-effort: failures are reported on
        stdout instead of raised, but KeyboardInterrupt/SystemExit are no
        longer swallowed.
        """
        try:
            print("Loading model from checkpoint...")
            # Read everything first so a missing file cannot leave the
            # agent with only part of the checkpoint applied.
            local_state = torch.load(path / 'model_checkpoint.local')
            target_state = torch.load(path / 'model_checkpoint.target')
            optimizer_state = torch.load(path / 'model_checkpoint.optimizer')
            with open(path / 'model_checkpoint.meta', 'rb') as file:
                # NOTE: pickle can execute arbitrary code; only load
                # checkpoints from trusted sources.
                meta = pickle.load(file)
            self.qnetwork_local.load_state_dict(local_state)
            self.qnetwork_target.load_state_dict(target_state)
            self.optimizer.load_state_dict(optimizer_state)
            return meta
        except FileNotFoundError:
            print("No checkpoint file was found")
        except Exception as err:
            print(f"Checkpoint could not be loaded: {err!r}")
        return defaults
class Agent():
    """Softmax-policy agent that learns from a replay buffer.

    The agent follows an agent_init / agent_start / agent_step / agent_end
    protocol and delegates the weight update to ``optimize_network``.
    """

    def __init__(self):
        self.name = "expected_sarsa_agent"

    def agent_init(self, agent_config):
        """Configure the agent from ``agent_config``.

        Keys read: 'replay_buffer_size', 'minibatch_sz', 'seed' (optional),
        'network_config' (including 'num_actions'), 'optimizer_config',
        'num_replay_updates_per_step', 'gamma' and 'tau'.
        """
        self.replay_buffer = ReplayBuffer(agent_config['replay_buffer_size'],
                                          agent_config['minibatch_sz'],
                                          agent_config.get("seed"))
        self.network = ActionValueNetwork(agent_config['network_config'])
        self.optimizer = Adam(self.network.layer_sizes,
                              agent_config["optimizer_config"])
        self.num_actions = agent_config['network_config']['num_actions']
        self.num_replay = agent_config['num_replay_updates_per_step']
        self.discount = agent_config['gamma']
        self.tau = agent_config['tau']

        self.rand_generator = np.random.RandomState(agent_config.get("seed"))

        self.last_state = None
        self.last_action = None

        self.sum_rewards = 0
        self.episode_steps = 0

    def policy(self, state):
        """Sample an action from the softmax over the action values."""
        action_values = self.network.get_action_values(state)
        probs_batch = softmax(action_values, self.tau)
        action = self.rand_generator.choice(self.num_actions,
                                            p=probs_batch.squeeze())
        return action

    def agent_start(self, state):
        """Begin an episode: reset counters and choose the first action."""
        self.sum_rewards = 0
        self.episode_steps = 0
        self.last_state = np.array([state])
        self.last_action = self.policy(self.last_state)
        return self.last_action

    def agent_step(self, reward, state):
        """Record a non-terminal transition, replay, and pick an action."""
        self.sum_rewards += reward
        self.episode_steps += 1

        state = np.array([state])
        # The action is sampled before the replay updates change the network.
        action = self.policy(state)

        # Append new experience to replay buffer (terminal flag 0)
        self.replay_buffer.append(self.last_state, self.last_action, reward,
                                  0, state)
        self._replay()

        # Update the last state and last action.
        self.last_state = state
        self.last_action = action

        return action

    def agent_end(self, reward):
        """Record the terminal transition and run the replay updates."""
        self.sum_rewards += reward
        self.episode_steps += 1

        # Set terminal state to an array of zeros
        state = np.zeros_like(self.last_state)

        # Append new experience to replay buffer (terminal flag 1)
        self.replay_buffer.append(self.last_state, self.last_action, reward,
                                  1, state)
        self._replay()

    def _replay(self):
        """Run ``num_replay`` updates once the buffer exceeds one minibatch."""
        if self.replay_buffer.size() <= self.replay_buffer.minibatch_size:
            return
        # Frozen copy of the network, taken once before the replay loop and
        # handed to optimize_network alongside the live network.
        current_q = deepcopy(self.network)
        for _ in range(self.num_replay):
            # Get sample experiences from the replay buffer
            experiences = self.replay_buffer.sample()
            # Call optimize_network to update the weights of the network
            optimize_network(experiences, self.discount, self.optimizer,
                             self.network, current_q, self.tau)

    def agent_message(self, message):
        """Answer ``"get_sum_reward"``; raise ValueError for anything else."""
        if message == "get_sum_reward":
            return self.sum_rewards
        # ValueError is an Exception subclass, so existing handlers still
        # catch it.
        raise ValueError("Unrecognized Message!")
class DQN(object):
    """Categorical (distributional) DQN agent.

    ``ConvNet`` is expected to return a probability distribution over the
    atoms in ``V_RANGE`` for every action; the shape remarks in the
    comments below are carried over from the original annotations.
    """

    def __init__(self):
        """Create prediction/target networks, replay buffer and optimizer."""
        self.pred_net, self.target_net = ConvNet(), ConvNet()
        # sync the target network with the prediction network
        self.update_target(self.target_net, self.pred_net, 1.0)
        # use gpu
        if USE_GPU:
            self.pred_net.cuda()
            self.target_net.cuda()

        # simulator step counter
        self.memory_counter = 0
        # target network step counter
        self.learn_step_counter = 0

        # create the replay buffer
        self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)

        # define optimizer
        self.optimizer = torch.optim.Adam(self.pred_net.parameters(), lr=LR)

        # discrete values
        self.value_range = torch.FloatTensor(V_RANGE)  # (N_ATOM)
        if USE_GPU:
            self.value_range = self.value_range.cuda()

    def update_target(self, target, pred, update_rate):
        """Move ``target`` towards ``pred``.

        target = (1 - update_rate) * target + update_rate * pred
        """
        for target_param, pred_param in zip(target.parameters(),
                                            pred.parameters()):
            target_param.data.copy_((1.0 - update_rate) * target_param.data +
                                    update_rate * pred_param.data)

    def save_model(self):
        """Save prediction network and target network."""
        self.pred_net.save(PRED_PATH)
        self.target_net.save(TARGET_PATH)

    def load_model(self):
        """Load prediction network and target network."""
        self.pred_net.load(PRED_PATH)
        self.target_net.load(TARGET_PATH)

    def choose_action(self, x, EPSILON):
        """Pick epsilon-greedy actions for a batch of states ``x``."""
        x = torch.FloatTensor(x)
        if USE_GPU:
            x = x.cuda()

        if np.random.uniform() >= EPSILON:
            # greedy case; no gradients are needed for acting
            with torch.no_grad():
                # (N_ENVS, N_ACTIONS, N_ATOM)
                action_value_dist = self.pred_net(x)
                # expected value per action: (N_ENVS, N_ACTIONS)
                action_value = torch.sum(
                    action_value_dist * self.value_range.view(1, 1, -1),
                    dim=2)
                action = torch.argmax(action_value,
                                      dim=1).data.cpu().numpy()
        else:
            # random exploration case
            action = np.random.randint(0, N_ACTIONS, (x.size(0)))
        return action

    def store_transition(self, s, a, r, s_, done):
        """Add one transition to the replay buffer."""
        self.memory_counter += 1
        self.replay_buffer.add(s, a, r, s_, float(done))

    @staticmethod
    def _project_distribution(next_probs, rewards, dones, support, gamma,
                              v_min, v_max, v_step):
        """Project the shifted next-state distribution onto the fixed atoms.

        Args:
            next_probs: array (m, n_atom) of next-state atom probabilities.
            rewards: array (m,) of rewards.
            dones: array (m,) with 1.0 for terminal transitions.
            support: array (n_atom,) of atom values.
            gamma: discount factor.
            v_min, v_max: bounds the shifted atoms are clipped to.
            v_step: spacing between neighbouring atoms.

        Returns:
            float array (m, n_atom); each row keeps the total mass of the
            matching row of ``next_probs``.
        """
        next_probs = np.asarray(next_probs)
        n_rows, n_atom = next_probs.shape

        # shifted support z_j' = r + gamma * (1 - done) * z_j
        shifted = (np.expand_dims(rewards, 1) +
                   gamma * np.expand_dims(1. - dones, 1) *
                   np.expand_dims(support, 0))
        # clip for categorical distribution
        shifted = np.clip(shifted, v_min, v_max)
        # relative position on the atom grid; clipped again so that float
        # rounding cannot produce an index outside [0, n_atom - 1]
        pos = np.clip((shifted - v_min) / v_step, 0, n_atom - 1)
        lb = np.floor(pos).astype(int)
        ub = np.ceil(pos).astype(int)

        # prob mass split between the neighbours, weighted with distance
        lower_mass = next_probs * (ub - pos)
        upper_mass = next_probs * (pos - lb)
        # When pos falls exactly on an atom both distances are zero; give
        # the whole mass to that atom instead of silently dropping it.
        exact = lb == ub
        lower_mass[exact] = next_probs[exact]

        rows = np.broadcast_to(np.arange(n_rows)[:, None], lb.shape)
        target = np.zeros((n_rows, n_atom))
        # add.at accumulates correctly when several atoms hit the same bin
        np.add.at(target, (rows, lb), lower_mass)
        np.add.at(target, (rows, ub), upper_mass)
        return target

    def learn(self):
        """Sample a batch and take one cross-entropy gradient step."""
        self.learn_step_counter += 1
        # target parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.update_target(self.target_net, self.pred_net, 1e-2)

        b_s, b_a, b_r, b_s_, b_d = self.replay_buffer.sample(BATCH_SIZE)

        b_s = torch.FloatTensor(b_s)
        b_a = torch.LongTensor(b_a)
        b_s_ = torch.FloatTensor(b_s_)

        if USE_GPU:
            b_s, b_a, b_s_ = b_s.cuda(), b_a.cuda(), b_s_.cuda()

        # action value distribution prediction
        q_eval = self.pred_net(b_s)  # (m, N_ACTIONS, N_ATOM)
        mb_size = q_eval.size(0)
        batch_index = torch.arange(mb_size, device=q_eval.device)
        # distribution of the action taken; one indexing op replaces the
        # per-sample index_select/stack loop
        q_eval = q_eval[batch_index, b_a.reshape(-1)]  # (m, N_ATOM)

        # get next state value
        with torch.no_grad():
            q_next = self.target_net(b_s_)  # (m, N_ACTIONS, N_ATOM)
            # next value mean
            q_next_mean = torch.sum(q_next * self.value_range.view(1, 1, -1),
                                    dim=2)  # (m, N_ACTIONS)
            best_actions = q_next_mean.argmax(dim=1)  # (m)
            q_next = q_next[batch_index, best_actions]  # (m, N_ATOM)
        q_next = q_next.data.cpu().numpy()

        # categorical projection of the target distribution
        q_target = self._project_distribution(
            q_next, b_r, b_d,
            self.value_range.data.cpu().numpy(), GAMMA, V_MIN, V_MAX, V_STEP)

        q_target = torch.FloatTensor(q_target)
        if USE_GPU:
            q_target = q_target.cuda()

        # cross-entropy terms, kept per sample so that the importance
        # weights below act on individual samples
        loss = q_target * (-torch.log(q_eval + 1e-8))  # (m, N_ATOM)
        loss = loss.mean(dim=1)  # (m)

        # importance weighted loss (uniform weights: plain replay buffer)
        b_w = torch.ones_like(loss)
        loss = torch.mean(b_w * loss)

        # backprop loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
class Smoothing_DQN(object):
    """Two cross-trained Q-networks whose targets average past networks.

    Each Q-network has a target network plus a bounded history of earlier
    network snapshots; the bootstrap value is a randomly weighted average
    of the max-Q estimates of the target network and its history.
    """

    def __init__(self):
        """Create both network pairs, histories, buffer and optimizers."""
        self.pred_net_Q1, self.target_net_Q1 = ConvNet(), ConvNet()
        self.pred_net_Q2, self.target_net_Q2 = ConvNet(), ConvNet()
        self.target_deque1 = deque(maxlen=n)
        self.target_deque2 = deque(maxlen=n)
        # sync the target networks with the prediction networks
        self.update_target(self.target_net_Q1, self.pred_net_Q1, 1.0)
        self.update_target(self.target_net_Q2, self.pred_net_Q2, 1.0)

        # use gpu
        if USE_GPU:
            self.pred_net_Q1.cuda()
            self.target_net_Q1.cuda()
            self.pred_net_Q2.cuda()
            self.target_net_Q2.cuda()

        # Seed the first history with a snapshot taken after the optional
        # move to the GPU, so the copy lives on the same device.
        self.target_deque1.append(self._snapshot(self.target_net_Q1))

        # simulator step counter
        self.memory_counter = 0
        # target network step counter
        self.learn_step_counter = 0
        # loss function
        self.loss_function = nn.MSELoss()
        # create the replay buffer
        self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)

        # define optimizers
        self.optimizer = torch.optim.Adam(self.pred_net_Q1.parameters(),
                                          lr=LR)
        self.optimizer1 = torch.optim.Adam(self.pred_net_Q2.parameters(),
                                           lr=LR)

    @staticmethod
    def _snapshot(net):
        """Return a frozen deep copy of ``net``."""
        import copy

        frozen = copy.deepcopy(net)
        for param in frozen.parameters():
            param.requires_grad_(False)
        return frozen

    def update_target(self, target, pred, update_rate):
        """Move ``target`` towards ``pred``.

        target = (1 - update_rate) * target + update_rate * pred
        """
        for target_param, pred_param in zip(target.parameters(),
                                            pred.parameters()):
            target_param.data.copy_((1.0 - update_rate) * target_param.data +
                                    update_rate * pred_param.data)

    def save_model(self):
        """Save prediction and target networks."""
        self.pred_net_Q1.save(PRED_PATH)
        self.target_net_Q1.save(TARGET_PATH)
        self.pred_net_Q2.save(PRED_PATH1)
        # NOTE(review): Q2's target is written to the same TARGET_PATH as
        # Q1's and overwrites it; no separate path is visible here — confirm
        # whether a second target path should be introduced.
        self.target_net_Q2.save(TARGET_PATH)

    def load_model(self):
        """Load prediction and target networks."""
        self.pred_net_Q1.load(PRED_PATH)
        self.target_net_Q1.load(TARGET_PATH)
        # Read Q2 from the file save_model() writes it to.
        self.pred_net_Q2.load(PRED_PATH1)
        self.target_net_Q2.load(TARGET_PATH)

    def choose_action(self, x, EPSILON):
        """Pick epsilon-greedy actions using the sum of both Q-networks."""
        x = torch.FloatTensor(x)
        if USE_GPU:
            x = x.cuda()

        # epsilon-greedy policy
        if np.random.uniform() >= EPSILON:
            # greedy case; no gradients are needed for acting
            with torch.no_grad():
                action_value = self.pred_net_Q1(x) + self.pred_net_Q2(x)
                action = torch.argmax(action_value,
                                      dim=1).data.cpu().numpy()
        else:
            # random exploration case
            action = np.random.randint(0, N_ACTIONS, (x.size(0)))
        return action

    def store_transition(self, s, a, r, s_, done):
        """Add one transition to the replay buffer."""
        self.memory_counter += 1
        self.replay_buffer.add(s, a, r, s_, float(done))

    def save_history(self):
        """Append snapshots of the prediction networks to the histories.

        Snapshots (not live references) are stored so that history entries
        keep the weights they had when recorded.
        """
        if self.memory_counter % dealy_interval == 0:
            self.target_deque1.append(self._snapshot(self.pred_net_Q1))
        # The original condition `counter % dealy_interval + 100 == 0` binds
        # as `(counter % dealy_interval) + 100 == 0` and is never true.
        # NOTE(review): interpreted as a schedule shifted by 100 steps
        # relative to Q1 — confirm the intended offset.
        if (self.memory_counter + 100) % dealy_interval == 0:
            self.target_deque2.append(self._snapshot(self.pred_net_Q2))

    def _smoothed_next_value(self, target_net, history, next_states):
        """Randomly weighted average of max-Q over target net and history."""
        alpha = np.random.uniform(0, 1, len(history) + 1)
        alpha = alpha / alpha.sum()
        with torch.no_grad():
            # the last weight belongs to the current target network
            value = float(alpha[-1]) * torch.max(target_net(next_states),
                                                 -1)[0]
            for weight, past_net in zip(alpha, history):
                value = value + float(weight) * torch.max(
                    past_net(next_states), -1)[0]
        return value

    @staticmethod
    def _taken_action_values(q_values, actions):
        """Select Q(s, a) per row; the result has the shape of ``actions``."""
        picked = q_values.gather(1, actions.reshape(q_values.size(0), -1))
        return picked.reshape(actions.shape)

    def learn(self):
        """Sample a batch and update both Q-networks against each other.

        Q1 regresses on the target built from Q2's target/history and vice
        versa. Returns the loss tensor of the Q2 update.
        """
        self.learn_step_counter += 1
        # target parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.update_target(self.target_net_Q1, self.pred_net_Q1, 1e-2)
            self.update_target(self.target_net_Q2, self.pred_net_Q2, 1e-2)

        b_s, b_a, b_r, b_s_, b_d = self.replay_buffer.sample(BATCH_SIZE)

        b_s = torch.FloatTensor(b_s)
        b_a = torch.LongTensor(b_a)
        b_r = torch.FloatTensor(b_r)
        b_s_ = torch.FloatTensor(b_s_)
        b_d = torch.FloatTensor(b_d)

        if USE_GPU:
            b_s, b_a, b_r, b_s_, b_d = (b_s.cuda(), b_a.cuda(), b_r.cuda(),
                                        b_s_.cuda(), b_d.cuda())

        # action value for current state
        q_eval1 = self._taken_action_values(self.pred_net_Q1(b_s), b_a)
        q_eval2 = self._taken_action_values(self.pred_net_Q2(b_s), b_a)

        # smoothed optimal action value for the next state
        q_next1 = self._smoothed_next_value(self.target_net_Q1,
                                            self.target_deque1, b_s_)
        q_next2 = self._smoothed_next_value(self.target_net_Q2,
                                            self.target_deque2, b_s_)

        q_target1 = (b_r + GAMMA * (1. - b_d) * q_next1).detach()
        q_target2 = (b_r + GAMMA * (1. - b_d) * q_next2).detach()

        # Q1 learns from the target built with Q2's networks
        loss = self.loss_function(q_eval1, q_target2)
        logger.store(loss=loss)

        # backprop loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Q2 learns from the target built with Q1's networks
        loss = self.loss_function(q_eval2, q_target1)
        self.optimizer1.zero_grad()
        loss.backward()
        self.optimizer1.step()
        return loss
class DQNAgent():
    """DQN agent with a policy network and a soft-updated target network."""

    def __init__(self, input_shape, action_size, buffer_size, batch_size,
                 gamma, lr, tau, update_every, device):
        """Initialize an Agent object.

        Params
        ======
            input_shape (tuple): dimension of each state
            action_size (int): dimension of each action
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            lr (float): learning rate
            tau (float): soft-update interpolation parameter
            update_every (int): how often to update the network
            device (string): device the networks and batches live on
        """
        self.input_shape = input_shape
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.lr = lr
        self.update_every = update_every
        self.tau = tau
        self.device = device

        # Q-Network
        self.policy_net = DQNLinear(input_shape, action_size).to(self.device)
        self.target_net = DQNLinear(input_shape, action_size).to(self.device)
        # Start the target from the same weights as the policy network;
        # otherwise early targets come from an unrelated random network.
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)

        # Replay memory
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size,
                                   self.device)

        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and learn every ``update_every`` calls."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every `update_every` time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step != 0:
            return

        # If enough samples are available in memory, get random subset
        # and learn
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

    def act(self, state, eps=0.01):
        """Return an epsilon-greedy action for the numpy array ``state``."""
        # .float() makes float64 observations usable with the network.
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.policy_net.eval()
        with torch.no_grad():
            action_values = self.policy_net(state)
        self.policy_net.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        """Take one TD gradient step and soft-update the target network.

        ``experiences`` is a tuple (states, actions, rewards, next_states,
        dones); ``actions`` is indexed as a 1-D batch here.
        """
        states, actions, rewards, next_states, dones = experiences

        # Get expected Q values from policy model
        Q_expected_current = self.policy_net(states)
        Q_expected = Q_expected_current.gather(
            1, actions.unsqueeze(1)).squeeze(1)

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.target_net(next_states).detach().max(1)[0]

        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.policy_net, self.target_net, self.tau)

    # θ'=θ×τ+θ'×(1−τ)
    def soft_update(self, policy_model, target_model, tau):
        """Blend ``policy_model`` weights into ``target_model`` by ``tau``."""
        for target_param, policy_param in zip(target_model.parameters(),
                                              policy_model.parameters()):
            target_param.data.copy_(tau * policy_param.data +
                                    (1.0 - tau) * target_param.data)

    def load_model(self, path):
        """Restore networks and optimizer from ``path``; return the scores.

        Both networks are loaded from the single saved state dict.
        """
        # map_location lets a checkpoint saved on another device be loaded
        # onto this agent's device.
        checkpoint = torch.load(path, map_location=self.device)
        self.policy_net.load_state_dict(checkpoint['state_dict'])
        self.target_net.load_state_dict(checkpoint['state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        scores = checkpoint['scores']

        return scores

    def save_model(self, path, scores):
        """Save policy weights, optimizer state and ``scores`` to ``path``."""
        model = {
            "state_dict": self.policy_net.state_dict(),
            "optimizer": self.optimizer.state_dict(),
            "scores": scores
        }
        torch.save(model, path)
class DuelingDQAgent():
    """Double-DQN style agent built on two ``DuelingQNetwork`` instances.

    The learning network is trained on replayed minibatches; the target
    network is overwritten with the learning network's weights every
    ``replace`` learning steps.
    """

    def __init__(self,
                 lr: float,
                 gamma: float,
                 obs_dims,
                 num_actions: int,
                 mem_size,
                 mini_batchsize,
                 epsilon_dec,
                 env_name,
                 algo_name,
                 epsilon=1.0,
                 replace=1000,
                 epsilon_min=0.1,
                 checkpoint_dir='temp/dqn/duelingdqn'):
        """Store hyperparameters and build the replay buffer and both networks.

        Params
        ======
            lr: learning rate forwarded to both networks
            gamma: discount factor
            obs_dims: observation shape forwarded to buffer and networks
            num_actions: number of discrete actions
            mem_size: replay buffer capacity
            mini_batchsize: number of transitions sampled per learn step
            epsilon_dec: amount subtracted from epsilon per learn step
            env_name, algo_name: used to build the network checkpoint names
            epsilon: initial exploration rate
            replace: learn steps between target-network synchronisations
            epsilon_min: lower bound for epsilon
            checkpoint_dir: directory handed to the networks
        """
        self.lr = lr
        self.gamma = gamma
        self.obs_dims = obs_dims
        self.num_actions = num_actions
        self.mini_batchsize = mini_batchsize
        self.epsilon_min = epsilon_min
        self.epsilon_dec = epsilon_dec
        self.epsilon = epsilon
        self.mem_counter = 0
        self.copy_counter = 0
        self.replace_target_cnt = replace
        self.checkpoint_dir = checkpoint_dir
        self.memories = ReplayBuffer(mem_size=mem_size,
                                     state_shape=self.obs_dims,
                                     num_actions=self.num_actions)
        self.action_space = [i for i in range(self.num_actions)]

        self.learning_network = DuelingQNetwork(
            lr=self.lr,
            num_actions=self.num_actions,
            input_dims=self.obs_dims,
            name=env_name + '_' + algo_name + '_learning',
            checkpoint_dir=self.checkpoint_dir)

        self.target_network = DuelingQNetwork(
            lr=self.lr,
            num_actions=self.num_actions,
            input_dims=self.obs_dims,
            name=env_name + '_' + algo_name + '_target',
            checkpoint_dir=self.checkpoint_dir)

    def decrement_epsilon(self):
        """Lower epsilon by ``epsilon_dec`` without ever dropping below ``epsilon_min``."""
        # Clamping avoids the one-step undershoot that happens when the last
        # decrement crosses the floor.
        self.epsilon = max(self.epsilon - self.epsilon_dec, self.epsilon_min)

    def store_memory(self, obs, action, reward, new_obs, done):
        """Append one transition to the replay buffer."""
        self.memories.store(obs, action, reward, new_obs, done)
        self.mem_counter += 1

    def sample_memory(self):
        """Sample a minibatch and move every field to the network device."""
        states, actions, rewards, new_states, dones = self.memories.sample(
            self.mini_batchsize)
        device = self.target_network.device
        states = T.tensor(states).to(device)
        actions = T.tensor(actions).to(device)
        rewards = T.tensor(rewards).to(device)
        new_states = T.tensor(new_states).to(device)
        dones = T.tensor(dones).to(device)
        return states, actions, rewards, new_states, dones

    def get_action(self, obs):
        """Return an epsilon-greedy action for observation ``obs``."""
        if np.random.random() < self.epsilon:
            action = np.random.choice(len(self.action_space), 1)[0]
        else:
            state = T.tensor(np.array([obs]),
                             dtype=T.float).to(self.learning_network.device)
            # Act with the online network: the target network can lag behind
            # by up to `replace` learning steps.
            with T.no_grad():
                returns_for_actions = self.learning_network.forward(state)
            action = T.argmax(returns_for_actions).cpu().detach().numpy()
        return action

    def learn(self):
        """Perform one double-DQN update once enough transitions are stored."""
        if self.mem_counter < self.mini_batchsize:
            return

        self.learning_network.optimizer.zero_grad()
        states, actions, rewards, new_states, dones = self.sample_memory()

        indices = np.arange(self.mini_batchsize)
        q_pred = self.learning_network.forward(states)[indices, actions]

        # The bootstrap target is a constant w.r.t. the optimised parameters,
        # so no autograd graph is recorded for it.
        with T.no_grad():
            q_next = self.learning_network.forward(new_states)
            # Action selection based on online weights
            actions_selected = T.argmax(q_next, dim=1)
            # Actions' return values are evaluated by the target network
            q_eval = self.target_network.forward(new_states)
            q_eval[dones] = 0.0
            q_target = rewards + self.gamma * q_eval[indices, actions_selected]

        cost = self.learning_network.loss(q_target, q_pred)
        cost.backward()
        self.learning_network.optimizer.step()
        self.decrement_epsilon()

        if self.copy_counter % self.replace_target_cnt == 0:
            self.copy_target_network()
        self.copy_counter += 1

    def copy_target_network(self):
        """Overwrite the target network with the learning network's weights."""
        self.target_network.load_state_dict(self.learning_network.state_dict())

    def save_models(self):
        """Checkpoint both networks via their own ``save`` methods."""
        self.learning_network.save()
        self.target_network.save()

    def load_models(self):
        """Restore both networks via their own ``load`` methods."""
        self.learning_network.load()
        self.target_network.load()
class Agent():
    """DQN / double-DQN agent that stores and acts on several agents' transitions.

    Two networks are used: ``network`` is trained every ``update_rate`` calls
    to :meth:`step`; ``target_network`` tracks it through soft updates with
    rate ``target_tau``, which stabilises learning.
    """

    def __init__(self,
                 state_size,
                 action_size,
                 dqn_type='DQN',
                 replay_memory_size=1e5,
                 batch_size=64,
                 gamma=0.99,
                 learning_rate=1e-3,
                 target_tau=2e-3,
                 update_rate=4,
                 seed=0):
        """
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            dqn_type (string): 'DQN' for vanilla DQN or 'DDQN' for double DQN
            replay_memory_size (int): size of the replay memory buffer
            batch_size (int): size of the memory batch used for model updates
            gamma (float): discount factor
            learning_rate (float): optimizer learning rate
            target_tau (float): soft-update interpolation parameter
            update_rate (int): number of steps between learning updates
            seed (int): random seed
        """
        self.dqn_type = dqn_type
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = int(replay_memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.learn_rate = learning_rate
        self.tau = target_tau
        self.update_rate = update_rate
        # random.seed() returns None, so seed first and keep the value itself.
        random.seed(seed)
        self.seed = seed

        self.network = QNetwork(state_size, action_size, seed).to(device)
        self.target_network = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.network.parameters(), lr=self.learn_rate)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed)
        # Initialize time step (for updating every `update_rate` steps)
        self.t_step = 0

    def step(self, state, actions, rewards, next_state, dones):
        """Store one transition per agent, then learn every ``update_rate`` calls."""
        # Save experience in replay memory
        for i, action in enumerate(actions):
            self.memory.add(state[i], action, rewards[i], next_state[i], dones[i])

        # Learn every `update_rate` time steps.
        self.t_step = (self.t_step + 1) % self.update_rate
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, eps=0.0):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.network.eval()
        with torch.no_grad():
            action_values = self.network(state)
        self.network.train()
        # One row of action values per agent in the first batch entry.
        num_agents = len(action_values[0])

        # Epsilon-greedy action selection (one draw decides for all agents)
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy()[0], 1)
        return np.array([
            random.choice(np.arange(self.action_size))
            for _ in range(num_agents)
        ])

    def learn(self, experiences, gamma, DQN=True):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
            DQN (bool): unused; kept for interface compatibility
        """
        states, actions, rewards, next_states, dones = experiences

        # Get Q values from current observations (s, a) using model network
        Qsa = self.network(states).gather(1, actions)

        if self.dqn_type == 'DDQN':
            # Double DQN: the online network picks the action, the target
            # network evaluates it. gather() selects one value per row; plain
            # tensor indexing here would index whole rows instead.
            Qsa_prime_actions = self.network(next_states).detach().max(
                1)[1].unsqueeze(1)
            Qsa_prime_targets = self.target_network(
                next_states).detach().gather(1, Qsa_prime_actions)
        else:
            # Regular (vanilla) DQN: max Q value for (s', a') from target model
            Qsa_prime_target_values = self.target_network(next_states).detach()
            Qsa_prime_targets = Qsa_prime_target_values.max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Qsa_targets = rewards + (gamma * Qsa_prime_targets * (1 - dones))

        # Compute loss (error)
        loss = F.mse_loss(Qsa, Qsa_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.network, self.target_network, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class DeepQAgent():
    """DQN agent with a learning network and a periodically synced target network."""

    def __init__(self,
                 lr: float,
                 gamma: float,
                 obs_dims,
                 num_actions: int,
                 mem_size,
                 mini_batchsize,
                 epsilon_dec,
                 env_name,
                 algo_name,
                 epsilon_min=0.1,
                 checkpoint_dir='temp/dqn',
                 replace=4):
        """Store hyperparameters and build the replay buffer and both networks.

        Params
        ======
            lr: learning rate forwarded to both networks
            gamma: discount factor
            obs_dims: observation shape forwarded to buffer and networks
            num_actions: number of discrete actions
            mem_size: replay buffer capacity
            mini_batchsize: number of transitions sampled per learn step
            epsilon_dec: amount subtracted from epsilon per learn step
            env_name, algo_name: used to build the network checkpoint names
            epsilon_min: lower bound for epsilon
            checkpoint_dir: directory handed to the networks
            replace: learn steps between target-network synchronisations
                (defaults to the previously hard-coded value of 4)
        """
        self.lr = lr
        self.gamma = gamma
        self.obs_dims = obs_dims
        self.num_actions = num_actions
        self.mini_batchsize = mini_batchsize
        self.epsilon_min = epsilon_min
        self.epsilon_dec = epsilon_dec
        self.epsilon = 1.0
        self.mem_counter = 0
        self.copy_counter = 0
        self.replace_target_cnt = replace
        self.checkpoint_dir = checkpoint_dir
        self.memories = ReplayBuffer(mem_size=mem_size,
                                     state_shape=self.obs_dims,
                                     num_actions=self.num_actions)
        self.action_space = [i for i in range(self.num_actions)]

        self.learning_network = DeepQNetwork(
            lr=self.lr,
            num_actions=self.num_actions,
            input_dims=self.obs_dims,
            name=env_name + '_' + algo_name + '_learning',
            checkpoint_dir=self.checkpoint_dir)

        self.target_network = DeepQNetwork(
            lr=self.lr,
            num_actions=self.num_actions,
            input_dims=self.obs_dims,
            name=env_name + '_' + algo_name + '_target',
            checkpoint_dir=self.checkpoint_dir)

    def decrement_epsilon(self):
        """Lower epsilon by ``epsilon_dec`` without ever dropping below ``epsilon_min``."""
        # Clamping avoids the one-step undershoot that happens when the last
        # decrement crosses the floor.
        self.epsilon = max(self.epsilon - self.epsilon_dec, self.epsilon_min)

    def store_memory(self, obs, action, reward, new_obs, done):
        """Append one transition to the replay buffer."""
        self.memories.store(obs, action, reward, new_obs, done)
        self.mem_counter += 1

    def sample_memory(self):
        """Sample a minibatch and move every field to the network device."""
        states, actions, rewards, new_states, dones = self.memories.sample(
            self.mini_batchsize)
        device = self.target_network.device
        states = T.tensor(states).to(device)
        actions = T.tensor(actions).to(device)
        rewards = T.tensor(rewards).to(device)
        new_states = T.tensor(new_states).to(device)
        dones = T.tensor(dones).to(device)
        return states, actions, rewards, new_states, dones

    def get_action(self, obs):
        """Return an epsilon-greedy action for observation ``obs``."""
        if np.random.random() < self.epsilon:
            action = np.random.choice(len(self.action_space), 1)[0]
        else:
            state = T.tensor(np.array([obs]),
                             dtype=T.float).to(self.learning_network.device)
            # Act with the online network rather than the lagging target copy.
            with T.no_grad():
                returns_for_actions = self.learning_network.forward(state)
            action = T.argmax(returns_for_actions).cpu().detach().numpy()
        return action

    def learn(self):
        """Perform one DQN update once enough transitions are stored."""
        if self.mem_counter < self.mini_batchsize:
            return

        self.learning_network.optimizer.zero_grad()
        # NOTE: the target network is deliberately NOT synced here on every
        # call; doing so would make it identical to the learning network and
        # remove the stabilising effect of a delayed target.
        states, actions, rewards, new_states, dones = self.sample_memory()

        indices = np.arange(self.mini_batchsize)
        q_pred = self.learning_network.forward(states)[indices, actions]

        # The bootstrap target needs no gradient.
        with T.no_grad():
            # dim=1 takes the max along actions; [0] keeps the values rather
            # than the indices
            q_next = self.target_network.forward(new_states).max(dim=1)[0]
            q_next[dones] = 0.0
            targets = rewards + self.gamma * q_next

        cost = self.learning_network.loss(targets, q_pred)
        cost.backward()
        self.learning_network.optimizer.step()
        self.decrement_epsilon()

        if self.copy_counter % self.replace_target_cnt == 0:
            self.copy_target_network()
        self.copy_counter += 1

    def copy_target_network(self):
        """Overwrite the target network with the learning network's weights."""
        self.target_network.load_state_dict(self.learning_network.state_dict())

    def save_models(self):
        """Checkpoint both networks via their own ``save`` methods."""
        self.learning_network.save()
        self.target_network.save()

    def load_models(self):
        """Restore both networks via their own ``load`` methods."""
        self.learning_network.load()
        self.target_network.load()
class QR_DQN(object):
    """Quantile-regression DQN agent with prediction and target networks."""

    def __init__(self):
        """Build both networks, the replay buffer and the optimizer."""
        self.pred_net, self.target_net = ConvNet(), ConvNet()
        # sync the target network with the prediction network
        self.update_target(self.target_net, self.pred_net, 1.0)
        # use gpu
        if USE_GPU:
            self.pred_net.cuda()
            self.target_net.cuda()

        # simulator step counter
        self.memory_counter = 0
        # target network step counter
        self.learn_step_counter = 0

        # create the replay buffer
        self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)

        # define optimizer
        self.optimizer = torch.optim.Adam(self.pred_net.parameters(), lr=LR)

    def update_target(self, target, pred, update_rate):
        """Blend ``pred`` parameters into ``target`` in place.

        ``update_rate`` of 1.0 is a hard copy; smaller values move the target
        only part of the way towards the prediction network.
        """
        for target_param, pred_param in zip(target.parameters(),
                                            pred.parameters()):
            target_param.data.copy_((1.0 - update_rate) * target_param.data +
                                    update_rate * pred_param.data)

    def save_model(self):
        """Save prediction network and target network."""
        self.pred_net.save(PRED_PATH)
        self.target_net.save(TARGET_PATH)

    def load_model(self):
        """Load prediction network and target network."""
        self.pred_net.load(PRED_PATH)
        self.target_net.load(TARGET_PATH)

    def choose_action(self, x, EPSILON):
        """Return one action per environment; random with probability ``EPSILON``."""
        x = torch.FloatTensor(x)
        if USE_GPU:
            x = x.cuda()

        if np.random.uniform() >= EPSILON:
            # greedy case: average over quantiles to get action values
            action_value = self.pred_net(x).mean(dim=2)  # (N_ENVS, N_ACTIONS)
            action = torch.argmax(action_value, dim=1).data.cpu().numpy()
        else:
            # random exploration case
            action = np.random.randint(0, N_ACTIONS, (x.size(0)))
        return action

    def store_transition(self, s, a, r, s_, done):
        """Add one transition to the replay buffer."""
        self.memory_counter += 1
        self.replay_buffer.add(s, a, r, s_, float(done))

    def learn(self):
        """Sample a batch and take one quantile-Huber gradient step."""
        self.learn_step_counter += 1
        # target parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.update_target(self.target_net, self.pred_net, 1e-2)

        b_s, b_a, b_r, b_s_, b_d = self.replay_buffer.sample(BATCH_SIZE)

        b_s = torch.FloatTensor(b_s)
        b_a = torch.LongTensor(b_a)
        b_r = torch.FloatTensor(b_r)
        b_s_ = torch.FloatTensor(b_s_)
        b_d = torch.FloatTensor(b_d)

        if USE_GPU:
            b_s, b_a, b_r, b_s_, b_d = (b_s.cuda(), b_a.cuda(), b_r.cuda(),
                                        b_s_.cuda(), b_d.cuda())

        # action value distribution prediction
        q_eval = self.pred_net(b_s)  # (m, N_ACTIONS, N_QUANT)
        mb_size = q_eval.size(0)
        batch_idx = torch.arange(mb_size, device=q_eval.device)
        # pick the quantiles of the action actually taken, one row per sample
        q_eval = q_eval[batch_idx, b_a.view(-1)]  # (m, N_QUANT)
        q_eval = q_eval.unsqueeze(2)  # (m, N_QUANT, 1)
        # note that dim 1 is for present quantile, dim 2 is for next quantile

        # get next state value
        q_next = self.target_net(b_s_).detach()  # (m, N_ACTIONS, N_QUANT)
        best_actions = q_next.mean(dim=2).argmax(dim=1)  # (m)
        q_next = q_next[batch_idx, best_actions]  # (m, N_QUANT)
        q_target = b_r.unsqueeze(1) + GAMMA * (1. - b_d.unsqueeze(1)) * q_next
        q_target = q_target.unsqueeze(1).detach()  # (m, 1, N_QUANT)

        # quantile Huber loss
        u = q_target - q_eval  # (m, N_QUANT, N_QUANT)
        tau = torch.FloatTensor(QUANTS_TARGET).view(1, -1, 1)  # (1, N_QUANT, 1)
        # note that tau is for present quantile
        if USE_GPU:
            tau = tau.cuda()
        weight = torch.abs(tau - u.le(0.).float())  # (m, N_QUANT, N_QUANT)
        # Expand both operands explicitly so the pairwise loss does not rely
        # on (and warn about) implicit broadcasting inside smooth_l1_loss.
        loss = F.smooth_l1_loss(q_eval.expand_as(u),
                                q_target.expand_as(u),
                                reduction='none')  # (m, N_QUANT, N_QUANT)
        loss = torch.mean(weight * loss, dim=1).mean(dim=1)  # (m)

        # uniform sample weights, so the batch loss is the plain mean
        loss = torch.mean(loss)

        # backprop loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
class Agent():
    '''Interact with and learn from environment.'''

    def __init__(self, state_size, action_size, seed):
        """Build actor/critic networks (local and target), noise and replay memory.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed first and keep the value itself.
        random.seed(seed)
        self.seed = seed
        self.t_step = 0  # counter for activating learning every few steps
        self.running_c_loss = 0
        self.running_a_loss = 0
        self.training_cnt = 0

        # Actor network (w/ target network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic network (w/ target network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    def act(self, state, mode):
        '''Returns actions for given state as per current policy.

        Params
        ======
            state (array): current state
            mode (string): 'train' (adds exploration noise) or 'test'

        Raises
        ======
            ValueError: if ``mode`` is neither 'train' nor 'test'
        '''
        if mode not in ('train', 'test'):
            # Previously an unknown mode silently returned None.
            raise ValueError(
                "mode must be 'train' or 'test', got {!r}".format(mode))

        state = torch.from_numpy(state).unsqueeze(0).float().to(
            device)  # shape of state (1, state_size)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if mode == 'train':
            # if train, then add OUNoise in action
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # activate learning every few steps
        self.t_step = self.t_step + 1
        if self.t_step % LEARN_EVERY_STEP == 0:
            # Learn, if enough samples are available in memory
            if len(self.memory) > BATCH_SIZE:
                for _ in range(10):  # update 10 times per learning
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def reset(self):
        """Reset the exploration noise process."""
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models.
        # The target is a constant for the critic loss, so no graph is built
        # through the target networks.
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            Q_targets_next = self.critic_target(next_states, actions_next)
            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.running_c_loss += critic_loss.item()
        self.training_cnt += 1

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.running_a_loss += actor_loss.item()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class Agent():
    """DQN agent for one player, with reward shaping derived from ray sensors.

    Two networks are used: ``network`` is trained when
    ``t_step % update_rate == 0``; ``target_network`` is moved towards it by
    :meth:`soft_update` with rate ``target_tau``.
    """

    # NOTE(review): the defaults learning_rate=1e4 and target_tau=1e3 look like
    # typos for 1e-4 and 1e-3 (a tau above 1 makes soft_update extrapolate).
    # They are left unchanged because defaults are part of the public
    # interface; confirm what callers pass before changing them.
    def __init__(self,
                 state_size,
                 action_size,
                 behavior_name,
                 index_player,
                 replay_memory_size=1e4,
                 batch_size=512,
                 gamma=0.99,
                 learning_rate=1e4,
                 target_tau=1e3,
                 update_rate=100,
                 seed=0):
        """Store hyperparameters and build networks, optimizer and replay memory.

        Params
        ======
            state_size: state dimension forwarded to ``QNetwork``
            action_size: number of discrete actions
            behavior_name: behavior name passed to ``env.get_steps``
            index_player: row of this player inside the observation arrays
            replay_memory_size: replay buffer capacity
            batch_size: minibatch size
            gamma: discount factor
            learning_rate: optimizer learning rate
            target_tau: soft-update interpolation parameter
            update_rate: number of steps between learning updates
            seed: random seed
        """
        self.state_size = state_size
        self.current_state = []
        self.action_size = action_size
        self.buffer_size = int(replay_memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.learn_rate = learning_rate
        self.tau = target_tau
        self.update_rate = update_rate
        # random.seed() returns None, so seed first and keep the value itself.
        random.seed(seed)
        self.seed = seed
        self.behavior_name = behavior_name
        self.index_player = index_player
        self.close_ball_reward = 0
        self.touch_ball_reward = 0

        self.network = QNetwork(state_size, action_size, seed).to(device)
        self.target_network = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.network.parameters(), lr=self.learn_rate)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed)
        # Initialize time step (for updating every `update_rate` steps)
        self.t_step = 0

    def load_model(self, path_model, path_target=None):
        """Load network weights from ``path_model`` and optionally the target's."""
        # map_location lets checkpoints saved on another device be restored.
        params = torch.load(path_model, map_location=device)
        self.network.set_params(params)
        # Loaded a second time on purpose: set_params is project code and may
        # modify the dict it receives.
        self.network.load_state_dict(
            torch.load(path_model, map_location=device))
        if path_target is not None:
            self.target_network.load_state_dict(
                torch.load(path_target, map_location=device))

    def model_step(self, state, action, reward, next_state):
        """Store a transition and learn every ``update_rate`` steps."""
        # save experience in replay memory
        self.memory.add(state, action, reward, next_state)

        # learn every `update_rate` time steps
        self.t_step = self.t_step + 1
        if self.t_step % self.update_rate == 0:
            # if enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma, self.t_step)

    def choose_action(self, state, eps=0.0):
        """Return an epsilon-greedy action index in ``[0, action_size)``."""
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.network.eval()
        with torch.no_grad():
            action_values = self.network(state)
        self.network.train()

        # epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma, stp):
        """Take one gradient step; soft-update the target when ``stp % 100 == 0``.

        Params
        ======
            experiences: (states, actions, rewards, next_states) tensors
            gamma (float): discount factor
            stp (int): current step counter, used to gate the target update
        """
        states, actions, rewards, next_states = experiences

        # Get Q values from current observations (s, a) using model network
        self.network.train()
        Q_sa = self.network(states).gather(1, actions)

        # get max Q values for (s', a') from target model
        Q_sa_prime_target_values = self.target_network(next_states).max(
            1)[0].to(device).float().detach()

        # compute Q targets for current states (no terminal mask: the replay
        # tuples carry no `done` flag)
        Q_sa_targets = rewards + gamma * Q_sa_prime_target_values.unsqueeze(1)

        # Compute loss (error)
        criterion = torch.nn.MSELoss(reduction='sum')
        loss = criterion(Q_sa.to(device), Q_sa_targets.to(device))

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # update target network
        # NOTE(review): the soft update is read as gated by the same condition
        # as the log message — confirm against the original layout.
        if stp % 100 == 0:
            print('Updating Model')
            self.soft_update(self.network, self.target_network, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """θ_target = τ*θ_local + (1 - τ)*θ_target, applied in place."""
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def Read(self):
        """Read this player's sensors, update shaped rewards, return the state.

        On any error while reading or decoding the observation, the touch and
        close-ball rewards are reset to 0 and the previous state is returned.
        """
        decision_steps, _ = env.get_steps(self.behavior_name)
        try:
            signal_front = np.array(
                sensor_front_sig(
                    decision_steps.obs[0][self.index_player, :]))  # 3 x 11 x 8
            signal_back = np.array(
                sensor_back_sig(
                    decision_steps.obs[1][self.index_player, :]))  # 3 x 3 x 8

            self.current_state = np.concatenate((signal_front, signal_back),
                                                axis=1)

            count_close_to_ball = 0
            count_touch_ball = 0
            count_back_touch = 0
            count_back_close = 0
            self.rew_d_to_our_post = 0
            self.rew_for_ball_dist = -0.1

            # Front observation
            # NOTE(review): channel 0 is presumably the "ball" flag and
            # channel 7 the normalised distance; the touch test is read as
            # nested under the ball flag — confirm.
            for ray in signal_front[0]:
                if ray[0] == 1.0:
                    count_close_to_ball += 1
                    self.rew_for_ball_dist = max(0.3 * (1 - ray[7]),
                                                 self.rew_for_ball_dist)
                    # Kicked the ball at the front
                    if ray[7] <= 0.03:
                        count_touch_ball += 1
                if ray[1] == 1.0:
                    self.rew_d_to_our_post = -0.1
                if ray[2] == 1.0:
                    self.rew_d_to_our_post = 0.1

            # Back observation
            for ray in signal_back[0]:
                if ray[0] == 1.0:
                    count_back_close += 0.2
                    # Touches the ball at the back
                    if ray[7] <= 0.03:
                        count_back_touch += 0.3

            self.back_touch = 1 if count_back_touch > 0 else 0.2
            self.back_close = 1 if count_back_close > 0 else 0.1

            # add reward if kick the ball
            self.touch_ball_reward = 1 if count_touch_ball > 0 else -0.15
            # Penalize for back touching the ball
            if count_back_touch > 0:
                self.touch_ball_reward = -0.25

            # Penalize if the ball is not in view
            self.close_ball_reward = 0.25 if count_close_to_ball > 0 else -0.05
            # Penalize if the ball is behind the agent
            if count_back_close > 0:
                self.close_ball_reward = -0.1

            return self.current_state
        except Exception:
            # Best-effort fallback kept from the original, narrowed so that
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            self.touch_ball_reward = 0
            self.close_ball_reward = 0
            return self.current_state

    def upd_after_goal(self, n_upds):
        """Let the replay memory post-process a goal, then learn once."""
        self.memory.upd_goal(n_upds)
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma, self.t_step)

    def we_goll(self):
        """Notify the replay memory via ``we_goll`` and learn on two batches."""
        self.memory.we_goll()
        if len(self.memory) > self.batch_size:
            for _ in range(2):
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma, self.t_step)

    def us_goll(self):
        """Notify the replay memory via ``us_goll`` and learn on two batches."""
        self.memory.us_goll()
        if len(self.memory) > self.batch_size:
            for _ in range(2):
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma, self.t_step)
class DQN(object):
    """DQN agent with optional double-Q targets and prioritized replay.

    Behaviour is selected by the module-level switches ``USE_CNN``,
    ``USE_GPU``, ``MEMORY_MODE`` and ``DOUBLE``.
    """

    def __init__(self):
        """Build evaluation/target networks, the replay buffer and the optimizer."""
        net_cls = ConvNet if USE_CNN else Net
        self.eval_net, self.target_net = net_cls(), net_cls()
        if USE_GPU:
            self.eval_net = self.eval_net.cuda()
            self.target_net = self.target_net.cuda()

        self.learn_step_counter = 0  # for target updating
        self.memory_counter = 0

        # Create the replay buffer
        if MEMORY_MODE == 'PER':
            self.replay_buffer = PrioritizedReplayBuffer(MEMORY_CAPACITY,
                                                         alpha=PER_ALPHA)
        else:
            self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)

        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=LR)

    def choose_action(self, x, EPSILON):
        """Return an action for a single state ``x``.

        In this class ``EPSILON`` is the probability of acting greedily; a
        random action is taken otherwise.
        """
        x = torch.FloatTensor(x)
        if USE_GPU:
            x = x.cuda()

        # input only one sample
        if np.random.uniform() < EPSILON:  # greedy
            actions_value = self.eval_net.forward(x.unsqueeze(0))
            # return the argmax
            action = torch.argmax(actions_value).data.cpu().numpy()
        else:  # random
            action = np.random.randint(0, N_ACTIONS)
        return action

    def store_transition(self, s, a, r, s_, done):
        """Add one transition to the replay buffer."""
        self.memory_counter += 1
        self.replay_buffer.add(s, a, r, s_, float(done))

    def learn(self, beta):
        """Sample a batch and minimise the (importance-weighted) Huber TD error.

        Params
        ======
            beta: importance-sampling exponent, only used when
                ``MEMORY_MODE == 'PER'``
        """
        # target parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1

        # Minimize the error in Bellman's equation on a batch sampled from
        # the replay buffer.
        if MEMORY_MODE == 'PER':
            experience = self.replay_buffer.sample(BATCH_SIZE, beta=beta)
            (b_state_memory, b_action_memory, b_reward_memory,
             b_next_state_memory, b_done, b_weights, b_idxes) = experience
        else:
            (b_state_memory, b_action_memory, b_reward_memory,
             b_next_state_memory, b_done) = self.replay_buffer.sample(BATCH_SIZE)
            b_weights, b_idxes = np.ones_like(b_reward_memory), None

        b_s = torch.FloatTensor(b_state_memory)
        b_a = torch.LongTensor(b_action_memory)
        b_r = torch.FloatTensor(b_reward_memory)
        b_s_ = torch.FloatTensor(b_next_state_memory)
        b_d = torch.FloatTensor(b_done)
        if USE_GPU:
            b_s, b_a, b_r, b_s_, b_d = (b_s.cuda(), b_a.cuda(), b_r.cuda(),
                                        b_s_.cuda(), b_d.cuda())

        # q_eval w.r.t. the action in experience
        q_eval = self.eval_net(b_s).gather(1, b_a.unsqueeze(1)).view(-1)

        # detach from graph, don't backpropagate through the target
        q_next = self.target_net(b_s_).detach()
        if DOUBLE:
            # online network selects the action, target network evaluates it
            _, best_actions = self.eval_net.forward(b_s_).detach().max(1)
            q_target = b_r + GAMMA * (1. - b_d) * q_next.gather(
                1, best_actions.unsqueeze(1)).squeeze(1)
        else:
            q_target = b_r + GAMMA * (1. - b_d) * q_next.max(1)[0]

        loss = F.smooth_l1_loss(q_eval, q_target, reduction='none')
        # Put the sample weights on whatever device the loss lives on; an
        # unconditional .cuda() here crashes CPU-only runs.
        weights = torch.as_tensor(b_weights,
                                  dtype=torch.float32,
                                  device=loss.device)
        loss = torch.mean(weights * loss)
        td_error = (q_target - q_eval).data.cpu().numpy()

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.eval_net.parameters(), 10.)
        self.optimizer.step()

        if MEMORY_MODE == 'PER':
            new_priorities = np.abs(td_error) + PER_EPSILON
            self.replay_buffer.update_priorities(b_idxes, new_priorities)

    def save_model(self):
        """Save evaluation network and target network simultaneously."""
        self.eval_net.save(EVAL_PATH)
        self.target_net.save(TARGET_PATH)

    def load_model(self):
        """Load evaluation network and target network simultaneously."""
        self.eval_net.load(EVAL_PATH)
        self.target_net.load(TARGET_PATH)
class DQN(object):
    """DQN agent with a prediction network and a slowly tracking target network."""

    def __init__(self):
        """Build both networks, the loss, the replay buffer and the optimizer."""
        self.pred_net, self.target_net = ConvNet(), ConvNet()
        # sync the target network with the prediction network
        self.update_target(self.target_net, self.pred_net, 1.0)
        # use gpu
        if USE_GPU:
            self.pred_net.cuda()
            self.target_net.cuda()

        # simulator step counter
        self.memory_counter = 0
        # target network step counter
        self.learn_step_counter = 0
        # loss function
        self.loss_function = nn.MSELoss()
        # create the replay buffer
        self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)
        # define optimizer
        self.optimizer = torch.optim.Adam(self.pred_net.parameters(), lr=LR)

    def update_target(self, target, pred, update_rate):
        """Blend ``pred`` parameters into ``target`` in place.

        ``update_rate`` of 1.0 is a hard copy; smaller values move the target
        only part of the way towards the prediction network.
        """
        for target_param, pred_param in zip(target.parameters(),
                                            pred.parameters()):
            target_param.data.copy_((1.0 - update_rate) * target_param.data +
                                    update_rate * pred_param.data)

    def save_model(self):
        """Save prediction network and target network."""
        self.pred_net.save(PRED_PATH)
        self.target_net.save(TARGET_PATH)

    def load_model(self):
        """Load prediction network and target network."""
        self.pred_net.load(PRED_PATH)
        self.target_net.load(TARGET_PATH)

    def choose_action(self, x, EPSILON):
        """Return one action per environment for the batch of states ``x``.

        A random action batch is drawn with probability ``EPSILON``.
        """
        x = torch.FloatTensor(x)
        if USE_GPU:
            x = x.cuda()

        # epsilon-greedy policy
        if np.random.uniform() >= EPSILON:
            # greedy case
            with torch.no_grad():
                action_value = self.pred_net(x)  # argmax below is over dim 1
            action = torch.argmax(action_value, dim=1).data.cpu().numpy()
        else:
            # random exploration case
            action = np.random.randint(0, N_ACTIONS, (x.size(0)))
        return action

    def store_transition(self, s, a, r, s_, done):
        """Add one transition to the replay buffer."""
        self.memory_counter += 1
        self.replay_buffer.add(s, a, r, s_, float(done))

    def learn(self):
        """Sample a batch, take one MSE TD step and return the loss tensor."""
        self.learn_step_counter += 1
        # target parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.update_target(self.target_net, self.pred_net, 1e-2)

        b_s, b_a, b_r, b_s_, b_d = self.replay_buffer.sample(BATCH_SIZE)

        b_s = torch.FloatTensor(b_s)
        b_a = torch.LongTensor(b_a)
        b_r = torch.FloatTensor(b_r)
        b_s_ = torch.FloatTensor(b_s_)
        b_d = torch.FloatTensor(b_d)

        if USE_GPU:
            b_s, b_a, b_r, b_s_, b_d = (b_s.cuda(), b_a.cuda(), b_r.cuda(),
                                        b_s_.cuda(), b_d.cuda())

        # action value for current state: pick Q(s, a) for the taken action
        # in one vectorized gather instead of a per-sample Python loop
        q_eval = self.pred_net(b_s)
        q_eval = q_eval.gather(1, b_a.view(-1, 1)).squeeze(1)

        # optimal action value for the next state; the target carries no gradient
        with torch.no_grad():
            q_next = torch.max(self.target_net(b_s_), -1)[0]
            q_target = b_r + GAMMA * (1. - b_d) * q_next

        # loss
        loss = self.loss_function(q_eval, q_target)
        # Log a plain float so the logger does not keep the autograd graph
        # of every step alive.
        logger.store(loss=loss.item())

        # backprop loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss
class DoubleDQAgent():
    """Double DQN agent.

    The learning (online) network selects the best next action and the
    target network evaluates it. The target network is overwritten with the
    learning network's weights every ``replace`` learning steps.
    """

    def __init__(self, lr: float, gamma: float, obs_dims, num_actions: int,
                 mem_size, mini_batchsize, epsilon_dec, env_name, algo_name,
                 epsilon=1.0, replace=1000, epsilon_min=0.1,
                 checkpoint_dir='results\\doubledqn'):
        """Create the replay memory, both networks and the summary writer.

        Args:
            lr: learning rate passed to both networks.
            gamma: discount factor applied to the bootstrapped value.
            obs_dims: observation shape passed to the memory and networks.
            num_actions: number of discrete actions.
            mem_size: capacity of the replay memory.
            mini_batchsize: number of transitions sampled per learning step.
            epsilon_dec: amount subtracted from epsilon per learning step.
            env_name: environment name, used in the checkpoint names.
            algo_name: algorithm name, used in the checkpoint names.
            epsilon: initial exploration probability.
            replace: number of learning steps between target-network copies.
            epsilon_min: lower bound for epsilon.
            checkpoint_dir: directory for checkpoints and the ``logs`` folder.
        """
        self.lr = lr
        self.gamma = gamma
        self.obs_dims = obs_dims
        self.num_actions = num_actions
        self.mini_batchsize = mini_batchsize
        self.epsilon_min = epsilon_min
        self.epsilon_dec = epsilon_dec
        self.epsilon = epsilon
        self.replace_target_cnt = replace
        # Number of transitions stored so far.
        self.mem_counter = 0
        # Number of completed learning steps; drives the target-copy schedule.
        self.copy_counter = 0
        self.checkpoint_dir = checkpoint_dir
        self.memories = ReplayBuffer(mem_size=mem_size,
                                     state_shape=self.obs_dims,
                                     num_actions=self.num_actions)
        self.action_space = list(range(self.num_actions))
        # NOTE(review): the two names order algo_name/env_name differently.
        # They are kept byte-identical so existing checkpoints still load.
        self.learning_network = DeepQNetwork(
            lr=self.lr,
            num_actions=self.num_actions,
            input_dims=self.obs_dims,
            name=algo_name + '_' + env_name + '_' + 'learning',
            checkpoint_dir=self.checkpoint_dir)
        self.target_network = DeepQNetwork(
            lr=self.lr,
            num_actions=self.num_actions,
            input_dims=self.obs_dims,
            name=env_name + '_' + algo_name + '_target',
            checkpoint_dir=self.checkpoint_dir)
        # Loss of the most recent learning step (0 until learn() has run).
        self.loss_value = 0
        self.writer = SummaryWriter(os.path.join(self.checkpoint_dir, 'logs'))

    def decrement_epsilon(self):
        """Lower epsilon by ``epsilon_dec`` without going below ``epsilon_min``."""
        # Clamp after subtracting, so the last step cannot undershoot the floor.
        self.epsilon = max(self.epsilon - self.epsilon_dec, self.epsilon_min)

    def store_memory(self, obs, action, reward, new_obs, done):
        """Store one transition in the replay memory."""
        self.memories.store(obs, action, reward, new_obs, done)
        self.mem_counter += 1

    def sample_memory(self):
        """Sample a mini-batch and move it to the target network's device.

        Returns:
            Tuple ``(states, actions, rewards, new_states, dones)`` of tensors.
        """
        states, actions, rewards, new_states, dones = self.memories.sample(
            self.mini_batchsize)
        device = self.target_network.device
        states = T.tensor(states).to(device)
        actions = T.tensor(actions).to(device)
        rewards = T.tensor(rewards).to(device)
        new_states = T.tensor(new_states).to(device)
        dones = T.tensor(dones).to(device)
        return states, actions, rewards, new_states, dones

    def get_action(self, obs):
        """Choose an action for ``obs`` with an epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest value under the learning
        network is returned.

        Returns:
            The action index: a NumPy integer scalar when exploring, a
            zero-dimensional NumPy array when acting greedily.
        """
        if np.random.random() < self.epsilon:
            action = np.random.choice(len(self.action_space), 1)[0]
        else:
            # Convert via one ndarray (a list of arrays is slow to convert),
            # then add the batch dimension.
            state = T.as_tensor(np.asarray(obs), dtype=T.float).unsqueeze(0)
            state = state.to(self.learning_network.device)
            # Act with the learning network, not the periodically synced
            # target copy, so the policy reflects the latest update.
            # Inference only, so no autograd graph is needed.
            with T.no_grad():
                returns_for_actions = self.learning_network.forward(state)
                action = T.argmax(returns_for_actions).cpu().numpy()
        return action

    def learn(self):
        """Perform one Double DQN update on a sampled mini-batch.

        Does nothing until at least ``mini_batchsize`` transitions have been
        stored. After the optimizer step, epsilon is decayed and the target
        network is refreshed every ``replace_target_cnt`` steps.
        """
        if self.mem_counter < self.mini_batchsize:
            return

        self.learning_network.optimizer.zero_grad()
        states, actions, rewards, new_states, dones = self.sample_memory()
        indices = np.arange(self.mini_batchsize)

        # Value of the action actually taken in each sampled state.
        q_pred = self.learning_network.forward(states)[indices, actions]

        # The target is a constant for the optimizer. Computing it without
        # autograd stops backward() from accumulating never-zeroed gradients
        # in the target network.
        with T.no_grad():
            # Action selection uses the learning (online) weights ...
            q_next = self.learning_network.forward(new_states)
            actions_selected = T.argmax(q_next, dim=1)
            # ... and the selected action is evaluated by the target network.
            q_eval = self.target_network.forward(new_states)
            # Terminal transitions have no future return.
            q_eval[dones] = 0.0
            q_target = rewards + self.gamma * q_eval[indices, actions_selected]

        cost = self.learning_network.loss(q_target, q_pred)
        cost.backward()
        self.learning_network.optimizer.step()
        self.decrement_epsilon()

        if self.copy_counter % self.replace_target_cnt == 0:
            self.copy_target_network()
        self.copy_counter += 1
        # Keep only the value; the graph-attached tensor is not retained.
        self.loss_value = cost.detach()

    def log(self, num_episode):
        """Write the latest loss and the network weight difference to the writer.

        Returns:
            Sum over all parameters of ``learning - target``.
        """
        diff = 0
        for p_learning, p_target in zip(self.learning_network.parameters(),
                                        self.target_network.parameters()):
            # NOTE(review): signed differences can cancel out -- confirm that
            # this, rather than an absolute difference, is the intended metric.
            diff += T.sum(p_learning.detach().cpu() - p_target.detach().cpu())
        self.writer.add_scalar("td_error", self.loss_value, num_episode)
        self.writer.add_scalar("learning_target_diff", diff, num_episode)
        return diff

    def copy_target_network(self):
        """Overwrite the target network with the learning network's weights."""
        self.target_network.load_state_dict(self.learning_network.state_dict())

    def save_models(self):
        """Save both networks via their own ``save`` methods."""
        self.learning_network.save()
        self.target_network.save()

    def load_models(self):
        """Load both networks via their own ``load`` methods."""
        self.learning_network.load()
        self.target_network.load()