class Agent(object):
    def __init__(self):
        self.network, self.target_network = AtariNet(ACTIONS_SIZE), AtariNet(ACTIONS_SIZE)
        self.memory = Memory(MEMORY_SIZE)
        self.learning_count = 0
        self.optimizer = torch.optim.Adam(self.network.parameters(), lr=LR)
        self.loss_func = nn.MSELoss()

    def action(self, state, israndom):
        if israndom and random.random() < EPSILON:
            return np.random.randint(0, ACTIONS_SIZE)
        state = torch.unsqueeze(torch.FloatTensor(state), 0)
        actions_value = self.network.forward(state)
        return torch.max(actions_value, 1)[1].data.numpy()[0]

    def learn(self, state, action, reward, next_state, done):
        old_val = self.network.forward(torch.FloatTensor([state])).gather(
            1, torch.LongTensor([[action]]))[0]
        target_val = self.network.forward(torch.FloatTensor([state]))
        if done:
            done = 0
            target = reward
        else:
            done = 1
            target = reward + GAMMA * torch.max(target_val)
        error = abs(old_val[0] - target)
        self.memory.add(error.data, (state, action, reward, next_state, done))

        if self.memory.tree.n_entries < MEMORY_THRESHOLD:
            return

        if self.learning_count % UPDATE_TIME == 0:
            self.target_network.load_state_dict(self.network.state_dict())
        self.learning_count += 1

        batch, idxs, is_weights = self.memory.sample(BATCH_SIZE)
        state = torch.FloatTensor([x[0] for x in batch])
        action = torch.LongTensor([[x[1]] for x in batch])
        reward = torch.FloatTensor([[x[2]] for x in batch])
        next_state = torch.FloatTensor([x[3] for x in batch])
        done = torch.FloatTensor([[x[4]] for x in batch])

        eval_q = self.network.forward(state).gather(1, action)
        next_q = self.target_network(next_state).detach()
        target_q = reward + GAMMA * next_q.max(1)[0].view(BATCH_SIZE, 1) * done
        errors = torch.abs(eval_q - target_q).data.numpy().flatten()
        loss = self.loss_func(eval_q, target_q)

        for i in range(BATCH_SIZE):
            idx = idxs[i]
            self.memory.update(idx, errors[i])

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
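# The following driving loop is a hypothetical usage sketch, not part of the original
# source. It assumes a classic Gym-style environment API (reset() -> obs,
# step() -> (obs, reward, done, info)) and the constants/classes defined above.
import gym

env = gym.make('PongDeterministic-v4')  # illustrative environment choice
agent = Agent()

for episode in range(10):
    state = env.reset()
    done = False
    while not done:
        # epsilon-greedy action from the online network
        action = agent.action(state, israndom=True)
        next_state, reward, done, _ = env.step(action)
        # store the transition with its TD-error priority and train once the buffer is warm
        agent.learn(state, action, reward, next_state, done)
        state = next_state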
class PERAgent(OffPolicyAgent):
    # construct agent's model separately, so it can be sized according to problem
    def __init__(self, n_replay, env, target_policy, behavior_policy, lr, discount, type='BC'):
        super().__init__(n_replay, env, target_policy, behavior_policy, lr, discount, type)

    # reseed numpy, reset weights of network
    # Reset must be performed before every episode
    def reset(self, seed):
        # Reset time
        self.t = 0
        # Set seed value
        np.random.seed(seed)
        # Reset replay buffer
        self.replay_buffer = Memory(self.n_replay)
        # Rebuild model
        self.build_model(self.n_features, self.env.nA)

    def generate_action(self, s, target_policy_sel=True):
        pval = self.target_policy[s] if target_policy_sel else self.behavior_policy[s]
        return np.random.choice(a=self.actions, p=pval)

    def generate_all_actions(self, target_policy_sel=True):
        return np.array([self.generate_action(item, target_policy_sel)
                         for item in range(self.target_policy.shape[0])])

    # Generate k steps of experience
    def generate_experience(self, k=16):
        # Initialize environment
        s = self.env.reset()
        done = False
        steps = 0
        # For each step
        while steps < k:
            # choose action according to behavior policy
            a = self.generate_action(s, False)
            # Take a step in environment based on chosen action
            (s2, r, done, _) = self.env.step(a)
            # Compute importance ratios
            ratio = self.target_policy[s, a] / self.behavior_policy[s, a]
            # states and target action for computing TD error
            current_state = self.construct_features([s])
            next_state = self.construct_features([s2])
            target_policy_action = self.generate_action(s, True)
            # Get bootstrap estimate of next state action values
            value_s = self.model.predict([current_state, np.zeros(current_state.shape[0])])
            value_next_s = self.model.predict([next_state, np.zeros(next_state.shape[0])])
            updated_val = r if done else (r + self.discount * value_next_s[0][target_policy_action])
            # Compute TD error
            td_error = np.abs(updated_val - value_s[0][a])
            # Stop execution if weights blow up - not converged
            if td_error > 10**5:
                return 1
            # Add experience to IR replay buffer
            self.replay_buffer.add_per(td_error, (s, a, r, s2))
            # Set for next step
            s = s2
            self.t += 1
            steps += 1
            # If episode ends, reset environment
            if done:
                done = False
                s = self.env.reset()
        return 0

    # do batch of training using replay buffer
    def train_batch(self, n_samples, batch_size):
        # Sample a minibatch from replay buffer
        data_samples, idxs, ratios, buffer_total = self.replay_buffer.sample(n_samples)
        # Extract rewards, states, next states, actions from samples
        rewards = extract_transition_components(data_samples, TransitionComponent.reward)
        next_states = extract_transition_components(data_samples, TransitionComponent.next_state)
        next_state_features = self.construct_features(next_states)
        states = extract_transition_components(data_samples, TransitionComponent.state)
        state_features = self.construct_features(states)
        actions = extract_transition_components(data_samples, TransitionComponent.action)
        # Calculate target policy actions
        target_policy_actions = np.array([self.generate_action(state, True) for state in states])
        # Calculate state values for TD error
        next_values_sa = self.model.predict([next_state_features, np.zeros(next_state_features.shape[0])])
        next_values = np.choose(target_policy_actions, next_values_sa.T)
        # v(s') is zero for terminal state, so need to fix model prediction
        for i in range(n_samples):
            # if experience ends in terminal state, value function returns 0
            if next_states[i] == -1 or next_states[i] == 10:  # TODO this only works for randomwalk of size 10
                next_values[i] = 0.0
        # Compute targets by bootstrap estimates
        targets = (rewards + self.discount * next_values)
        # Compute error for updating priorities
        pred_values = self.model.predict([state_features, np.zeros(state_features.shape[0])])
        final_targets = np.copy(pred_values)
        np.put_along_axis(final_targets, np.expand_dims(actions, axis=1), targets[:, np.newaxis], axis=1)
        pred = np.choose(actions, pred_values.T)
        error = np.abs(pred - targets)
        # Priority update
        for i in range(batch_size):
            self.replay_buffer.update(idxs[i], error[i])
        # train on samples
        self.model.fit([state_features, ratios], final_targets, batch_size=batch_size, verbose=0)
class MAD4PG:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, buffer_size=int(1e6),
                 batch_size=64, gamma=0.99, tau=1e-3, update_every=3,
                 num_mc_steps=5, num_agents=2):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma
        self.TAU = tau
        self.UPDATE_EVERY = update_every
        self.num_mc_steps = num_mc_steps
        self.experiences = [ExperienceQueue(num_mc_steps) for _ in range(num_agents)]
        self.memory = Memory(buffer_size)
        self.t_step = 0
        self.train_start = batch_size
        self.mad4pg_agent = [
            D4PG(state_size, action_size, seed, device,
                 num_atoms=N_ATOMS, q_min=Vmin, q_max=Vmax),
            D4PG(state_size, action_size, seed, device,
                 num_atoms=N_ATOMS, q_min=Vmin, q_max=Vmax)
        ]

    def acts(self, states, add_noise=0.0):
        acts = []
        for s, a in zip(states, self.mad4pg_agent):
            acts.append(a.act(np.expand_dims(s, 0), add_noise))
        return np.vstack(acts)

    # borrowed from https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/tree/master/Chapter14
    def distr_projection(self, next_distr_v, rewards_v, dones_mask_t, gamma):
        next_distr = next_distr_v.data.cpu().numpy()
        rewards = rewards_v.data.cpu().numpy()
        dones_mask = dones_mask_t.cpu().numpy().astype(bool)
        batch_size = len(rewards)
        proj_distr = np.zeros((batch_size, N_ATOMS), dtype=np.float32)
        dones_mask = np.squeeze(dones_mask)
        rewards = rewards.reshape(-1)
        for atom in range(N_ATOMS):
            tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards + (Vmin + atom * DELTA_Z) * gamma))
            b_j = (tz_j - Vmin) / DELTA_Z
            l = np.floor(b_j).astype(np.int64)
            u = np.ceil(b_j).astype(np.int64)
            eq_mask = u == l
            proj_distr[eq_mask, l[eq_mask]] += next_distr[eq_mask, atom]
            ne_mask = u != l
            proj_distr[ne_mask, l[ne_mask]] += next_distr[ne_mask, atom] * (u - b_j)[ne_mask]
            proj_distr[ne_mask, u[ne_mask]] += next_distr[ne_mask, atom] * (b_j - l)[ne_mask]
        if dones_mask.any():
            proj_distr[dones_mask] = 0.0
            tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards[dones_mask]))
            b_j = (tz_j - Vmin) / DELTA_Z
            l = np.floor(b_j).astype(np.int64)
            u = np.ceil(b_j).astype(np.int64)
            eq_mask = u == l
            if dones_mask.shape == ():
                if dones_mask:
                    proj_distr[0, l] = 1.0
                else:
                    ne_mask = u != l
                    proj_distr[0, l] = (u - b_j)[ne_mask]
                    proj_distr[0, u] = (b_j - l)[ne_mask]
            else:
                eq_dones = dones_mask.copy()
                eq_dones[dones_mask] = eq_mask
                if eq_dones.any():
                    proj_distr[eq_dones, l[eq_mask]] = 1.0
                ne_mask = u != l
                ne_dones = dones_mask.copy()
                ne_dones[dones_mask] = ne_mask
                if ne_dones.any():
                    proj_distr[ne_dones, l[ne_mask]] = (u - b_j)[ne_mask]
                    proj_distr[ne_dones, u[ne_mask]] = (b_j - l)[ne_mask]
        return torch.FloatTensor(proj_distr).to(device)

    def step(self, states, actions, rewards, next_states, dones):
        for agent_index in range(len(self.mad4pg_agent)):
            agent_experiences = self.experiences[agent_index]
            agent_experiences.states.appendleft(states[agent_index])
            agent_experiences.rewards.appendleft(rewards[agent_index] * self.GAMMA**self.num_mc_steps)
            agent_experiences.actions.appendleft(actions[agent_index])
            if len(agent_experiences.rewards) == self.num_mc_steps or dones[agent_index]:
                # N-step return: r = r1 + gamma*r2 + ... + gamma^(t-1)*rt
                done_tensor = torch.tensor(dones[agent_index]).float().to(device)
                condition = True
                while condition:
                    for i in range(len(agent_experiences.rewards)):
                        agent_experiences.rewards[i] /= self.GAMMA
                    state = torch.tensor(agent_experiences.states[-1]).float().unsqueeze(0).to(device)
                    next_state = torch.tensor(next_states[agent_index]).float().unsqueeze(0).to(device)
                    action = torch.tensor(agent_experiences.actions[-1]).float().unsqueeze(0).to(device)
                    sum_reward = torch.tensor(sum(agent_experiences.rewards)).float().unsqueeze(0).to(device)
                    with evaluating(self.mad4pg_agent[agent_index]) as cur_agent:
                        q_logits_expected = cur_agent.critic_local(state, action)
                        action_next = cur_agent.actor_target(next_state)
                        q_target_logits_next = cur_agent.critic_target(next_state, action_next)
                        q_target_distr_next = F.softmax(q_target_logits_next, dim=1)
                        q_target_distr_next_projected = self.distr_projection(
                            q_target_distr_next, sum_reward, done_tensor,
                            self.GAMMA**self.num_mc_steps)
                        cross_entropy = -F.log_softmax(q_logits_expected, dim=1) * q_target_distr_next_projected
                        error = cross_entropy.sum(dim=1).mean().cpu().data
                    self.memory.add(error, (states[agent_index], actions[agent_index],
                                            sum_reward, next_states[agent_index], dones[agent_index]))
                    agent_experiences.states.pop()
                    agent_experiences.rewards.pop()
                    agent_experiences.actions.pop()
                    condition = False and dones[agent_index] and len(agent_experiences.states) > 0
                if dones[agent_index]:
                    agent_experiences.states.clear()
                    agent_experiences.rewards.clear()
                    agent_experiences.actions.clear()

        self.t_step = (self.t_step + 1) % self.UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            # print(self.memory.tree.n_entries)
            if self.memory.tree.n_entries > self.train_start:
                for agent_index in range(len(self.mad4pg_agent)):
                    sampled_experiences, idxs = self.sample()
                    self.learn(self.mad4pg_agent[agent_index], sampled_experiences, idxs)

    def sample(self):
        # prioritized experience replay
        mini_batch, idxs, is_weights = self.memory.sample(self.BATCH_SIZE)
        mini_batch = np.array(mini_batch).transpose()
        statess = np.vstack([m for m in mini_batch[0] if m is not None])
        actionss = np.vstack([m for m in mini_batch[1] if m is not None])
        rewardss = np.vstack([m for m in mini_batch[2] if m is not None])
        next_statess = np.vstack([m for m in mini_batch[3] if m is not None])
        doness = np.vstack([m for m in mini_batch[4] if m is not None])
        # bool to binary
        doness = doness.astype(int)
        statess = torch.from_numpy(statess).float().to(device)
        actionss = torch.from_numpy(actionss).float().to(device)
        rewardss = torch.from_numpy(rewardss).float().to(device)
        next_statess = torch.from_numpy(next_statess).float().to(device)
        doness = torch.from_numpy(doness).float().to(device)
        return (statess, actionss, rewardss, next_statess, doness), idxs

    def learn(self, agent, experiences, idxs):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        # Compute critic loss
        q_logits_expected = agent.critic_local(states, actions)
        actions_next = agent.actor_target(next_states)
        q_targets_logits_next = agent.critic_target(next_states, actions_next)
        q_targets_distr_next = F.softmax(q_targets_logits_next, dim=1)
        q_targets_distr_projected_next = self.distr_projection(
            q_targets_distr_next, rewards, dones, self.GAMMA**self.num_mc_steps)
        cross_entropy = -F.log_softmax(q_logits_expected, dim=1) * q_targets_distr_projected_next
        critic_loss = cross_entropy.sum(dim=1).mean()
        with torch.no_grad():
            errors = cross_entropy.sum(dim=1).cpu().data.numpy()
        # update priority
        for i in range(self.BATCH_SIZE):
            idx = idxs[i]
            self.memory.update(idx, errors[i])
        # Minimize the loss
        agent.critic_optimizer.zero_grad()
        critic_loss.backward()
        agent.critic_optimizer.step()

        # Compute actor loss
        actions_pred = agent.actor_local(states)
        crt_distr_v = agent.critic_local(states, actions_pred)
        actor_loss = -agent.critic_local.distr_to_q(crt_distr_v)
        actor_loss = actor_loss.mean()
        # Minimize the loss
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        agent.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        agent.soft_update(agent.critic_local, agent.critic_target, self.TAU)
        agent.soft_update(agent.actor_local, agent.actor_target, self.TAU)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed,
                 buffer_size=BUFFER_SIZE, batch_size=BATCH_SIZE):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.buffer_size = buffer_size
        self.memory = Memory(capacity=self.buffer_size)  # internal memory using SumTree
        self.batch_size = batch_size

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done,
             batch_size=BATCH_SIZE, update_every=UPDATE_EVERY):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % update_every
        if self.t_step == 0:
            # Learn, if enough samples are available in memory
            if self.memory.tree.n_entries >= batch_size:
                experiences, idxs, is_weights = self.sample()
                self.learn(experiences, idxs, is_weights)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            # action = [act + self.noise.sample() for act in action]
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, idxs, is_weights, batch_size=BATCH_SIZE, gamma=GAMMA):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        # Loss calculation
        critic_loss = (torch.from_numpy(is_weights).float().to(device) *
                       F.mse_loss(Q_expected, Q_targets)).mean()
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Introducing gradient clipping
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # ................. update priorities in prioritized replay buffer ................. #
        # Calculate errors used in prioritized replay buffer
        errors = (Q_expected - Q_targets).squeeze().cpu().data.numpy()
        # update priority
        for i in range(batch_size):
            idx = idxs[i]
            self.memory.update(idx, errors[i])

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def add(self, state, action, reward, next_state, done, gamma=GAMMA):
        """Add a new experience to memory."""
        next_state_torch = torch.from_numpy(next_state).float().to(device)
        reward_torch = torch.unsqueeze(torch.from_numpy(np.array(reward)).float().to(device), 1)
        done_torch = torch.unsqueeze(torch.from_numpy(np.array(done).astype(np.uint8)).float().to(device), 1)
        state_torch = torch.from_numpy(state).float().to(device)
        action_torch = torch.from_numpy(action).float().to(device)

        self.actor_target.eval()
        self.critic_target.eval()
        self.critic_local.eval()
        with torch.no_grad():
            action_next = self.actor_target(next_state_torch)
            Q_target_next = self.critic_target(next_state_torch, action_next)
            Q_target = reward_torch + (gamma * Q_target_next * (1 - done_torch))
            Q_expected = self.critic_local(state_torch, action_torch)
        self.actor_local.train()
        self.critic_target.train()
        self.critic_local.train()

        # Error used in prioritized replay buffer
        error = (Q_expected - Q_target).squeeze().cpu().data.numpy()
        # Adding experiences to prioritized replay buffer
        for i in np.arange(len(reward)):
            self.memory.add(error[i], (state[i], action[i], reward[i], next_state[i], done[i]))

    def sample(self):
        """Randomly sample a batch of experiences from memory."""
        experiences, idxs, is_weights = self.memory.sample(self.batch_size)
        states = np.vstack([e[0] for e in experiences])
        states = torch.from_numpy(states).float().to(device)
        actions = np.vstack([e[1] for e in experiences])
        actions = torch.from_numpy(actions).float().to(device)
        rewards = np.vstack([e[2] for e in experiences])
        rewards = torch.from_numpy(rewards).float().to(device)
        next_states = np.vstack([e[3] for e in experiences])
        next_states = torch.from_numpy(next_states).float().to(device)
        dones = np.vstack([e[4] for e in experiences]).astype(np.uint8)
        dones = torch.from_numpy(dones).float().to(device)
        return (states, actions, rewards, next_states, dones), idxs, is_weights
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = Memory(BUFFER_SIZE)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        next_state = torch.from_numpy(next_state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        self.qnetwork_target.eval()
        with torch.no_grad():
            target_action_values = self.qnetwork_target(next_state)
            expected_action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        self.qnetwork_target.train()

        old_val = expected_action_values[0][action]
        new_val = reward
        if not done:
            new_val += GAMMA * torch.max(target_action_values)
        error = abs(old_val - new_val)

        # Save experience in replay memory
        self.memory.add(error, (state, action, reward, next_state, done))

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if self.memory.tree.n_entries > BATCH_SIZE:
                experiences = self.memory.sample(BATCH_SIZE)
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy()).astype(int)
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        mini_batches, idxs, is_weights = experiences
        states = torch.from_numpy(np.vstack([mini_batch[0] for mini_batch in mini_batches])).float().to(device)
        actions = torch.from_numpy(np.vstack([mini_batch[1] for mini_batch in mini_batches])).long().to(device)
        rewards = torch.from_numpy(np.vstack([mini_batch[2] for mini_batch in mini_batches])).float().to(device)
        next_states = torch.from_numpy(np.vstack([mini_batch[3] for mini_batch in mini_batches])).float().to(device)
        dones = torch.from_numpy(np.vstack([int(mini_batch[4]) for mini_batch in mini_batches])).float().to(device)

        # Double DQN target: select next actions with the local network, evaluate them with the target network
        Q_source_next = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
        Q_target = self.qnetwork_target(next_states)
        Q_double_target = torch.tensor([Q_target[i][max_index] for i, max_index in enumerate(Q_source_next)]).detach().unsqueeze(1)
        Q_observed = rewards + (gamma * Q_double_target * (1 - dones))
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        errors = torch.abs(Q_expected - Q_observed).data.numpy()
        # update priority
        for i in range(BATCH_SIZE):
            idx = idxs[i]
            self.memory.update(idx, errors[i])

        loss = (torch.FloatTensor(is_weights) * F.mse_loss(Q_expected, Q_observed)).mean()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
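# The SumTree-backed Memory shared by these agents is not shown here. A common
# implementation follows proportional prioritization (Schaul et al., 2016):
# priority p_i = (|TD error| + eps)^alpha, sampling probability P(i) = p_i / sum_k p_k,
# and importance-sampling weight w_i = (N * P(i))^(-beta). The snippet below is a
# minimal, self-contained numpy sketch of that math; the names and constants are
# illustrative and are not the original Memory API.
import numpy as np

ALPHA = 0.6     # how strongly priorities skew sampling
BETA = 0.4      # importance-sampling correction strength
EPS_PRIO = 0.01  # keeps zero-error transitions sampleable

td_errors = np.array([0.5, 2.0, 0.1, 1.2])           # |TD error| per stored transition
priorities = (np.abs(td_errors) + EPS_PRIO) ** ALPHA
probs = priorities / priorities.sum()                 # P(i)

batch_idx = np.random.choice(len(probs), size=2, p=probs)
is_weights = (len(probs) * probs[batch_idx]) ** (-BETA)
is_weights /= is_weights.max()                         # normalize for stability

print(batch_idx, is_weights)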
class DQNAgent():
    def __init__(self, state_size, action_size):
        self.render = False
        self.load_model = False

        # get size of state and action
        self.state_size = state_size
        self.action_size = action_size

        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.lr_step_size = 10
        self.lr_gamma = 0.9
        self.memory_size = 2**15
        self.epsilon = 1.0
        self.epsilon_min = 0.05
        self.explore_step = 1000
        self.epsilon_decay = 0.99995
        self.batch_size = 64
        self.train_start = 10000

        # create prioritized replay memory using SumTree
        self.memory = Memory(self.memory_size)

        # create main model and target model
        self.model = DQN(state_size, action_size)
        self.model.apply(self.weights_init)
        self.target_model = DQN(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.scheduler = StepLR(self.optimizer, step_size=self.lr_step_size, gamma=self.lr_gamma)

        # initialize target model
        self.update_target_model()

        if self.load_model:
            self.model = torch.load('save_model/per_dqn')
            self.model.train()

    # weight xavier initialize
    def weights_init(self, m):
        classname = m.__class__.__name__
        if classname.find('Linear') != -1:
            torch.nn.init.xavier_uniform_(m.weight)

    # after some time interval update the target model to be same with model
    def update_target_model(self):
        self.target_model.load_state_dict(self.model.state_dict())

    # get action from model using epsilon-greedy policy
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            state = torch.from_numpy(state).float()
            q_value = self.model(state)
            _, action = torch.max(q_value, 1)
            return int(action)

    # save sample (error, <s, a, r, s'>) to the replay memory
    def append_sample(self, state, action, reward, next_state, done):
        target = self.model(torch.tensor(state).float()).data
        old_val = target[0][action]
        target_val = self.target_model(torch.tensor(next_state).float()).data
        if done:
            target[0][action] = reward
        else:
            target[0][action] = reward + self.discount_factor * torch.max(target_val)

        error = abs(old_val - target[0][action])
        self.memory.add(error, (state, action, reward, next_state, done))

    # pick samples from prioritized replay memory (with batch_size)
    def train_model(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            self.epsilon = max(self.epsilon, self.epsilon_min)

        mini_batch, idxs, is_weights = self.memory.sample(self.batch_size)
        mini_batch = np.array(mini_batch).transpose()

        states = np.vstack(mini_batch[0])
        actions = list(mini_batch[1])
        rewards = list(mini_batch[2])
        next_states = np.vstack(mini_batch[3])
        dones = mini_batch[4]

        # bool to binary
        dones = dones.astype(int)

        # Q function of current state
        states = torch.tensor(states).float()
        pred = self.model(states)

        # one-hot encoding
        a = torch.tensor(actions, dtype=torch.long).view(-1, 1)
        one_hot_action = torch.zeros(self.batch_size, self.action_size)
        one_hot_action.scatter_(1, a, 1)
        pred = torch.sum(pred.mul(one_hot_action), dim=1)

        # Q function of next state
        next_states = torch.tensor(next_states, dtype=torch.float)
        next_pred = self.target_model(next_states.float()).data

        rewards = torch.tensor(rewards, dtype=torch.float)
        dones = torch.tensor(dones, dtype=torch.float)

        # Q Learning: get maximum Q value at s' from target model
        target = rewards + (1 - dones) * self.discount_factor * next_pred.max(1)[0]

        errors = torch.abs(pred - target).data.numpy()
        # update priority
        for i in range(self.batch_size):
            idx = idxs[i]
            self.memory.update(idx, errors[i])

        self.optimizer.zero_grad()
        # MSE loss weighted by importance-sampling weights
        loss = (torch.tensor(is_weights).float() * F.mse_loss(pred, target)).mean()
        loss.backward()
        # and train
        self.optimizer.step()
        return loss.item()
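# A hypothetical training loop for the DQNAgent above (not part of the original
# code). It assumes a classic Gym CartPole-style API and the classes defined above;
# the episode count, reshaping, and target-update schedule are illustrative choices.
import gym
import numpy as np

env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)

for episode in range(300):
    state = np.reshape(env.reset(), [1, state_size])
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        # store with initial TD-error priority, then train once enough samples exist
        agent.append_sample(state, action, reward, next_state, done)
        if agent.memory.tree.n_entries >= agent.train_start:
            agent.train_model()
        state = next_state
    agent.update_target_model()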
class DDQN_Agent:
    def __init__(self, useDepth=False):
        self.useDepth = useDepth
        self.eps_start = 0.9
        self.eps_end = 0.05
        self.eps_decay = 30000
        self.gamma = 0.8
        self.learning_rate = 0.001
        self.batch_size = 512
        self.memory = Memory(10000)
        self.max_episodes = 10000
        self.save_interval = 2
        self.test_interval = 10
        self.network_update_interval = 10
        self.episode = -1
        self.steps_done = 0
        self.max_steps = 34

        self.policy = DQN()
        self.target = DQN()
        self.test_network = DQN()
        self.target.eval()
        self.test_network.eval()
        self.updateNetworks()

        self.env = DroneEnv(useDepth)
        self.optimizer = optim.Adam(self.policy.parameters(), self.learning_rate)

        if torch.cuda.is_available():
            print('Using device:', device)
            print(torch.cuda.get_device_name(0))
        else:
            print("Using CPU")

        # LOGGING
        cwd = os.getcwd()
        self.save_dir = os.path.join(cwd, "saved models")
        if not os.path.exists(self.save_dir):
            os.mkdir("saved models")
        if not os.path.exists(os.path.join(cwd, "videos")):
            os.mkdir("videos")

        if torch.cuda.is_available():
            self.policy = self.policy.to(device)  # to use GPU
            self.target = self.target.to(device)  # to use GPU
            self.test_network = self.test_network.to(device)  # to use GPU

        # model backup
        files = glob.glob(self.save_dir + '\\*.pt')
        if len(files) > 0:
            files.sort(key=os.path.getmtime)
            file = files[-1]
            checkpoint = torch.load(file)
            self.policy.load_state_dict(checkpoint['state_dict'])
            self.episode = checkpoint['episode']
            self.steps_done = checkpoint['steps_done']
            self.updateNetworks()
            print("Saved parameters loaded"
                  "\nModel: ", file,
                  "\nSteps done: ", self.steps_done,
                  "\nEpisode: ", self.episode)
        else:
            if os.path.exists("log.txt"):
                open('log.txt', 'w').close()
            if os.path.exists("last_episode.txt"):
                open('last_episode.txt', 'w').close()
            if os.path.exists("last_episode.txt"):
                open('saved_model_params.txt', 'w').close()

        self.optimizer = optim.Adam(self.policy.parameters(), self.learning_rate)

        obs, _ = self.env.reset()
        tensor = self.transformToTensor(obs)
        writer.add_graph(self.policy, tensor)

    def updateNetworks(self):
        self.target.load_state_dict(self.policy.state_dict())

    def transformToTensor(self, img):
        tensor = torch.FloatTensor(img).to(device)
        tensor = tensor.unsqueeze(0)
        tensor = tensor.unsqueeze(0)
        tensor = tensor.float()
        return tensor

    def convert_size(self, size_bytes):
        if size_bytes == 0:
            return "0B"
        size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
        i = int(math.floor(math.log(size_bytes, 1024)))
        p = math.pow(1024, i)
        s = round(size_bytes / p, 2)
        return "%s %s" % (s, size_name[i])

    def act(self, state):
        self.eps_threshold = self.eps_end + (self.eps_start - self.eps_end) * math.exp(
            -1.0 * self.steps_done / self.eps_decay)
        self.steps_done += 1
        if random.random() > self.eps_threshold:
            # print("greedy")
            if torch.cuda.is_available():
                action = np.argmax(self.policy(state).cpu().data.squeeze().numpy())
            else:
                action = np.argmax(self.policy(state).data.squeeze().numpy())
        else:
            action = random.randrange(0, 4)
        return int(action)

    def append_sample(self, state, action, reward, next_state):
        next_state = self.transformToTensor(next_state)

        current_q = self.policy(state).squeeze().cpu().detach().numpy()[action]
        next_q = self.target(next_state).squeeze().cpu().detach().numpy()[action]
        expected_q = reward + (self.gamma * next_q)

        error = abs(current_q - expected_q)
        self.memory.add(error, state, action, reward, next_state)

    def learn(self):
        if self.memory.tree.n_entries < self.batch_size:
            return

        states, actions, rewards, next_states, idxs, is_weights = self.memory.sample(self.batch_size)

        states = tuple(states)
        next_states = tuple(next_states)

        states = torch.cat(states)
        actions = np.asarray(actions)
        rewards = np.asarray(rewards)
        next_states = torch.cat(next_states)

        current_q = self.policy(states)[[range(0, self.batch_size)], [actions]]
        next_q = self.target(next_states).cpu().detach().numpy()[[range(0, self.batch_size)], [actions]]
        expected_q = torch.FloatTensor(rewards + (self.gamma * next_q)).to(device)

        errors = torch.abs(current_q.squeeze() - expected_q.squeeze()).cpu().detach().numpy()
        # update priority
        for i in range(self.batch_size):
            idx = idxs[i]
            self.memory.update(idx, errors[i])

        loss = F.smooth_l1_loss(current_q.squeeze(), expected_q.squeeze())
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def train(self):
        print("Starting...")

        score_history = []
        reward_history = []
        if self.episode == -1:
            self.episode = 1

        for e in range(1, self.max_episodes + 1):
            start = time.time()
            state, _ = self.env.reset()
            steps = 0
            score = 0
            while True:
                state = self.transformToTensor(state)

                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action)

                if steps == self.max_steps:
                    done = 1

                # self.memorize(state, action, reward, next_state)
                self.append_sample(state, action, reward, next_state)
                self.learn()

                state = next_state
                steps += 1
                score += reward

                if done:
                    print("----------------------------------------------------------------------------------------")
                    if self.memory.tree.n_entries < self.batch_size:
                        print("Training will start after ", self.batch_size - self.memory.tree.n_entries, " steps.")
                        break

                    print("episode:{0}, reward: {1}, mean reward: {2}, score: {3}, epsilon: {4}, total steps: {5}".format(
                        self.episode, reward, round(score / steps, 2), score, self.eps_threshold, self.steps_done))
                    score_history.append(score)
                    reward_history.append(reward)
                    with open('log.txt', 'a') as file:
                        file.write("episode:{0}, reward: {1}, mean reward: {2}, score: {3}, epsilon: {4}, total steps: {5}\n".format(
                            self.episode, reward, round(score / steps, 2), score, self.eps_threshold, self.steps_done))

                    if torch.cuda.is_available():
                        print('Total Memory:', self.convert_size(torch.cuda.get_device_properties(0).total_memory))
                        print('Allocated Memory:', self.convert_size(torch.cuda.memory_allocated(0)))
                        print('Cached Memory:', self.convert_size(torch.cuda.memory_reserved(0)))
                        print('Free Memory:', self.convert_size(torch.cuda.get_device_properties(0).total_memory - (
                            torch.cuda.max_memory_allocated() + torch.cuda.max_memory_reserved())))

                        # tensorboard --logdir=runs
                        memory_usage_allocated = np.float64(round(torch.cuda.memory_allocated(0) / 1024 ** 3, 1))
                        memory_usage_cached = np.float64(round(torch.cuda.memory_reserved(0) / 1024 ** 3, 1))
                        writer.add_scalar("memory_usage_allocated", memory_usage_allocated, self.episode)
                        writer.add_scalar("memory_usage_cached", memory_usage_cached, self.episode)

                    writer.add_scalar('epsilon_value', self.eps_threshold, self.episode)
                    writer.add_scalar('score_history', score, self.episode)
                    writer.add_scalar('reward_history', reward, self.episode)
                    writer.add_scalar('Total steps', self.steps_done, self.episode)
                    writer.add_scalars('General Look', {'score_history': score, 'reward_history': reward}, self.episode)

                    # save checkpoint
                    if self.episode % self.save_interval == 0:
                        checkpoint = {
                            'episode': self.episode,
                            'steps_done': self.steps_done,
                            'state_dict': self.policy.state_dict()
                        }
                        torch.save(checkpoint, self.save_dir + '//EPISODE{}.pt'.format(self.episode))

                    if self.episode % self.network_update_interval == 0:
                        self.updateNetworks()

                    self.episode += 1
                    end = time.time()
                    stopWatch = end - start
                    print("Episode is done, episode time: ", stopWatch)

                    if self.episode % self.test_interval == 0:
                        self.test()

                    break
        writer.close()

    def test(self):
        self.test_network.load_state_dict(self.target.state_dict())

        start = time.time()
        steps = 0
        score = 0
        image_array = []
        state, next_state_image = self.env.reset()
        image_array.append(next_state_image)

        while True:
            state = self.transformToTensor(state)

            action = int(np.argmax(self.test_network(state).cpu().data.squeeze().numpy()))
            next_state, reward, done, next_state_image = self.env.step(action)
            image_array.append(next_state_image)

            if steps == self.max_steps:
                done = 1

            state = next_state
            steps += 1
            score += reward

            if done:
                print("----------------------------------------------------------------------------------------")
                print("TEST, reward: {}, score: {}, total steps: {}".format(
                    reward, score, self.steps_done))
                with open('tests.txt', 'a') as file:
                    file.write("TEST, reward: {}, score: {}, total steps: {}\n".format(
                        reward, score, self.steps_done))
                writer.add_scalars('Test', {'score': score, 'reward': reward}, self.episode)

                end = time.time()
                stopWatch = end - start
                print("Test is done, test time: ", stopWatch)

                # Convert images to video
                frameSize = (256, 144)
                import cv2
                video = cv2.VideoWriter("videos\\test_video_episode_{}_score_{}.avi".format(self.episode, score),
                                        cv2.VideoWriter_fourcc(*'DIVX'), 7, frameSize)
                for img in image_array:
                    video.write(img)
                video.release()
                break
class Agent:  # todo change name to Agent
    def __init__(self, eps, lr, gamma, batch_size, tau, max_memory,
                 lambda_1, lambda_2, lambda_3, n_steps, l_margin):
        # Input Parameters
        self.eps = eps  # eps-greedy
        self.gamma = gamma  # discount factor
        self.batch_size = batch_size
        self.tau = tau  # frequency of target replacement
        self.ed = 0.005  # bonus for demonstration  # todo they aren't used
        self.ea = 0.001  # todo they aren't used
        self.l_margin = l_margin
        self.n_steps = n_steps
        self.lambda1 = lambda_1  # n-step return
        self.lambda2 = lambda_2  # supervised loss
        self.lambda3 = lambda_3  # L2

        self.counter = 0  # target replacement counter  # todo change to iter_counter
        self.replay = Memory(capacity=max_memory)
        self.loss = nn.MSELoss()
        self.policy = Policy()  # todo change not have to pass architecture
        self.opt = optim.Adam(self.policy.predictNet.parameters(), lr=lr, weight_decay=lambda_3)

        self.replay.e = 0
        self.demoReplay = ddict(list)

        self.noisy = hasattr(self.policy.predictNet, "sample")

    def choose_action(self, state):
        state = torch.Tensor(state)
        A = self.policy.sortedA(state)
        if self.noisy:
            self.policy.predictNet.sample()
            return A[0]
        if np.random.random() < self.eps:
            return random.sample(A, 1)[0]
        return A[0]

    def sample(self):
        return self.replay.sample(self.batch_size)

    def store_demonstration(self, s, a, r, s_, done, episode):
        s = torch.Tensor(s)
        s_ = torch.Tensor(s_)
        episodeReplay = self.demoReplay[episode]  # replay of certain demo episode
        index = len(episodeReplay)
        data = (s, a, r, s_, done, (episode, index))
        episodeReplay.append(data)
        self.replay.add(transition=data, demonstration=True)

    def store_transition(self, s, a, r, s_, done):
        s = torch.Tensor(s)
        s_ = torch.Tensor(s_)
        data = (s, a, r, s_, done, None)
        self.replay.add(transition=data, demonstration=False)

    def calculate_td_errors(self, samples):
        if self.noisy:
            self.policy.predictNet.sample()  # for choosing action
        alls, alla, allr, alls_, alldone, *_ = zip(*samples)
        maxA = [self.policy.sortedA(s_)[0] for s_ in alls_]
        if self.noisy:
            self.policy.predictNet.sample()  # for prediction
            self.policy.targetNet.sample()  # for target
        Qtarget = torch.Tensor(allr)
        Qtarget[torch.tensor(alldone) != 1] += self.gamma * self.policy.calcQ(
            self.policy.targetNet, alls_, maxA)[torch.tensor(alldone) != 1]
        Qpredict = self.policy.calcQ(self.policy.predictNet, alls, alla)
        return Qpredict, Qtarget

    def JE(self, samples):
        loss = torch.tensor(0.0)
        count = 0  # number of demo
        for s, aE, *_, isdemo in samples:
            if isdemo is None:
                continue
            A = self.policy.sortedA(s)
            if len(A) == 1:
                continue
            QE = self.policy.calcQ(self.policy.predictNet, s, aE)
            A1, A2 = np.array(A)[:2]  # action with largest and second largest Q
            maxA = A2 if (A1 == aE).all() else A1
            Q = self.policy.calcQ(self.policy.predictNet, s, maxA)
            if (Q + self.l_margin) < QE:
                continue
            else:
                loss += (Q - QE)
                count += 1
        return loss / count if count != 0 else loss

    def Jn(self, samples, Qpredict):
        # wait for refactoring, can't use with noisy layer
        loss = torch.tensor(0.0)
        count = 0
        for i, (s, a, r, s_, done, isdemo) in enumerate(samples):
            if isdemo is None:
                continue
            episode, idx = isdemo
            nidx = idx + self.n_steps
            lepoch = len(self.demoReplay[episode])
            if nidx > lepoch:
                continue
            count += 1
            ns, na, nr, ns_, ndone, _ = zip(*self.demoReplay[episode][idx:nidx])
            ns, na, ns_, ndone = ns[-1], na[-1], ns_[-1], ndone[-1]
            discountedR = reduce(lambda x, y: (x[0] + self.gamma**x[1] * y, x[1] + 1), nr, (0, 0))[0]
            maxA = self.policy.sortedA(ns_)[0]
            target = discountedR if ndone else discountedR + self.gamma**self.n_steps * self.policy.calcQ(
                self.policy.targetNet, ns_, maxA)
            predict = Qpredict[i]
            loss += (target - predict)**2
        return loss / count

    def L2(self, parameters):
        loss = 0
        for p in parameters:
            loss += (p**2).sum()
        return loss

    def learn(self):
        self.opt.zero_grad()
        samples, idxs = self.sample()
        Qpredict, Qtarget = self.calculate_td_errors(samples)

        for i in range(self.batch_size):
            error = math.fabs(float(Qpredict[i] - Qtarget[i]))
            self.replay.update(idxs[i], error)

        JDQ = self.loss(Qpredict, Qtarget)
        JE = self.JE(samples)
        Jn = self.Jn(samples, Qpredict)
        L2 = self.L2(self.policy.predictNet.parameters())
        J = JDQ + self.lambda2 * JE + self.lambda1 * Jn + self.lambda3 * L2
        J.backward()
        self.opt.step()

        self.counter += 1
        if self.counter % self.tau == 0:
            self.policy.updateTargetNet()
    action_str, rotate_idx = utils.get_action_info(pixel_index)
    old_value = trainer.forward(color, depth, action_str, False, rotate_idx,
                                clear_grad=True)[0, pixel_index[1], pixel_index[2]]
    print("New Q value: {:03f} -> {:03f} | TD Target: {:03f}".format(
        old_q[i], old_value, td_target_list[i]))
    print("========================================================================================")
    # update_tree[i/5].update(idxs[i], td_target-old_value)
    if i // mini_batch_size == 0 or specific_tool == 0:
        suction_1_memory_buffer.update(idxs[i], td_target - old_value)
    elif i // mini_batch_size == 1 or specific_tool == 1:
        suction_2_memory_buffer.update(idxs[i], td_target - old_value)
    else:
        gripper_memory_buffer.update(idxs[i], td_target - old_value)

back_t = time.time() - back_ts
if arduino:
    arduino.write("b 1000")
print("Backpropagation & updating: {} seconds \t|\t Avg. {} seconds".format(
    back_t, back_t / (3 * mini_batch_size)))

if learned_times % updating_freq == 0:
    print("[%f] Replace target network to behavior network" % (
        program_time + time.time() - program_ts))
    trainer.target_net.load_state_dict(trainer.behavior_net.state_dict())