class ReplayBuffer:
    """Fixed-size buffer to store experience tuples.

    Experiences are held in a ``Memory`` object (defined elsewhere in the
    project). ``sample`` returns batched tensors together with the indices
    and weights that ``Memory.sample`` hands back.
    """

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Initialize a ReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.action_size = action_size
        self.memory = Memory(capacity=buffer_size,
                             replay_beta=REPLAY_BETA,
                             replay_alpha=REPLAY_ALPHA,
                             replay_beta_increment=REPLAY_BETA_INCREMENT)
        self.batch_size = batch_size
        # random.seed() returns None, so seed the generator first and keep
        # the seed value itself rather than the call's (empty) result.
        random.seed(seed)
        self.seed = seed
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory.

        While the memory holds at most ``batch_size`` entries the experience
        is inserted with a random error in [0, 1); afterwards it is inserted
        with ``self.memory.max_prio``.
        """
        if len(self.memory) <= self.batch_size:
            error = random.random()
        else:
            error = self.memory.max_prio
        e = self.experience(state, action, reward, next_state, done)
        self.memory.add(error, e)

    def sample(self):
        """Sample a batch of experiences from memory.

        Returns
        =======
            ((states, actions, rewards, next_states, dones), idxs, ws) where
            the five tensors are stacked row-wise and moved to ``device``;
            ``idxs`` and ``ws`` are passed through from ``Memory.sample``.
        """
        experiences, idxs, ws = self.memory.sample(n=self.batch_size)
        # Entries that are None are skipped; filter once instead of per field.
        batch = [e for e in experiences if e is not None]

        states = torch.from_numpy(
            np.vstack([e.state for e in batch])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in batch])).long().to(device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in batch])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in batch])).float().to(device)
        # Booleans are cast to uint8 before the float conversion.
        dones = torch.from_numpy(
            np.vstack([e.done for e in batch]).astype(np.uint8)
        ).float().to(device)

        return (states, actions, rewards, next_states, dones), idxs, ws

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)
class Agent(object):
    """DQN agent with an online/target network pair and a ``Memory`` buffer.

    ``Memory`` and ``AtariNet`` are defined elsewhere in the project; the
    memory is fed an error value per transition and later updated with the
    batch errors, so it is presumably a prioritized replay buffer.
    """

    def __init__(self):
        """Build both networks, the replay memory, optimizer and loss."""
        self.network = AtariNet(ACTIONS_SIZE)
        self.target_network = AtariNet(ACTIONS_SIZE)
        self.memory = Memory(MEMORY_SIZE)
        self.learning_count = 0
        self.optimizer = torch.optim.Adam(self.network.parameters(), lr=LR)
        self.loss_func = nn.MSELoss()

    def action(self, state, israndom):
        """Pick an action for ``state``.

        Params
        ======
            state: a single observation (without batch dimension)
            israndom (bool): when true, a uniformly random action is returned
                with probability EPSILON; otherwise the greedy action is used
        """
        if israndom and random.random() < EPSILON:
            return np.random.randint(0, ACTIONS_SIZE)
        state = torch.unsqueeze(torch.FloatTensor(state), 0)
        # Action selection needs no gradient graph.
        with torch.no_grad():
            actions_value = self.network.forward(state)
        return torch.max(actions_value, 1)[1].numpy()[0]

    def learn(self, state, action, reward, next_state, done):
        """Store one transition and, once enough are stored, do one update.

        The stored tuple's last element is a *not-done* mask (0 for terminal
        transitions, 1 otherwise) so it can multiply the bootstrap term.
        """
        # ---- initial priority for the new transition ----
        with torch.no_grad():
            old_val = self.network.forward(torch.FloatTensor([state])).gather(
                1, torch.LongTensor([[action]]))[0]
            # The TD target bootstraps from the *next* state via the target
            # network, matching the batched target below. (The previous code
            # evaluated the online network on the current state.)
            next_val = self.target_network.forward(
                torch.FloatTensor([next_state]))
        if done:
            not_done = 0
            target = reward
        else:
            not_done = 1
            target = reward + GAMMA * torch.max(next_val)
        error = abs(old_val[0] - target)
        self.memory.add(float(error),
                        (state, action, reward, next_state, not_done))

        if self.memory.tree.n_entries < MEMORY_THRESHOLD:
            return

        # Hard-sync the target network every UPDATE_TIME learning steps.
        if self.learning_count % UPDATE_TIME == 0:
            self.target_network.load_state_dict(self.network.state_dict())
        self.learning_count += 1

        # ---- batched update ----
        batch, idxs, is_weights = self.memory.sample(BATCH_SIZE)
        states = torch.FloatTensor([x[0] for x in batch])
        actions = torch.LongTensor([[x[1]] for x in batch])
        rewards = torch.FloatTensor([[x[2]] for x in batch])
        next_states = torch.FloatTensor([x[3] for x in batch])
        not_dones = torch.FloatTensor([[x[4]] for x in batch])

        eval_q = self.network.forward(states).gather(1, actions)
        next_q = self.target_network(next_states).detach()
        target_q = (rewards
                    + GAMMA * next_q.max(1)[0].view(BATCH_SIZE, 1) * not_dones)
        errors = torch.abs(eval_q - target_q).data.numpy().flatten()
        # NOTE(review): is_weights is returned by the memory but not applied
        # to the loss here — confirm whether that is intentional.
        loss = self.loss_func(eval_q, target_q)

        # Refresh the stored error of every sampled transition.
        for idx, err in zip(idxs, errors):
            self.memory.update(idx, err)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
class MAD4PG:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, buffer_size=int(1e6),
                 batch_size=64, gamma=0.99, tau=1e-3, update_every=3,
                 num_mc_steps=5, num_agents=2):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            buffer_size (int): capacity passed to ``Memory``
            batch_size (int): size of each training batch
            gamma (float): discount factor
            tau (float): soft-update interpolation parameter
            update_every (int): learn once every this many ``step`` calls
            num_mc_steps (int): number of steps accumulated per transition
            num_agents (int): number of D4PG agents / experience queues
        """
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma
        self.TAU = tau
        self.UPDATE_EVERY = update_every
        self.num_mc_steps = num_mc_steps
        self.experiences = [
            ExperienceQueue(num_mc_steps) for _ in range(num_agents)
        ]
        self.memory = Memory(buffer_size)
        self.t_step = 0
        self.train_start = batch_size
        # One D4PG learner per agent so this list always matches
        # self.experiences (previously two were hard-coded).
        self.mad4pg_agent = [
            D4PG(state_size, action_size, seed, device,
                 num_atoms=N_ATOMS, q_min=Vmin, q_max=Vmax)
            for _ in range(num_agents)
        ]

    def acts(self, states, add_noise=0.0):
        """Return the stacked actions of all agents for their own states."""
        acts = [
            agent.act(np.expand_dims(s, 0), add_noise)
            for s, agent in zip(states, self.mad4pg_agent)
        ]
        return np.vstack(acts)

    # borrow from https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/tree/master/Chapter14
    def distr_projection(self, next_distr_v, rewards_v, dones_mask_t, gamma):
        """Project a next-state distribution onto the fixed atom support.

        Each atom value is shifted by the reward, scaled by ``gamma``,
        clipped to [Vmin, Vmax], and its probability mass is split between
        the two neighbouring atoms. Rows flagged as done are rebuilt from
        the reward alone.

        Returns
        =======
            torch.FloatTensor of shape (batch, N_ATOMS) on ``device``
        """
        next_distr = next_distr_v.data.cpu().numpy()
        rewards = rewards_v.data.cpu().numpy()
        # `np.bool` was removed from NumPy (1.24); the builtin is equivalent.
        dones_mask = dones_mask_t.cpu().numpy().astype(bool)
        batch_size = len(rewards)
        proj_distr = np.zeros((batch_size, N_ATOMS), dtype=np.float32)
        dones_mask = np.squeeze(dones_mask)
        rewards = rewards.reshape(-1)

        for atom in range(N_ATOMS):
            tz_j = np.minimum(
                Vmax,
                np.maximum(Vmin, rewards + (Vmin + atom * DELTA_Z) * gamma))
            b_j = (tz_j - Vmin) / DELTA_Z
            l = np.floor(b_j).astype(np.int64)
            u = np.ceil(b_j).astype(np.int64)
            # Mass landing exactly on an atom goes there undivided.
            eq_mask = u == l
            proj_distr[eq_mask, l[eq_mask]] += next_distr[eq_mask, atom]
            # Otherwise split it linearly between the lower and upper atom.
            ne_mask = u != l
            proj_distr[ne_mask, l[ne_mask]] += (
                next_distr[ne_mask, atom] * (u - b_j)[ne_mask])
            proj_distr[ne_mask, u[ne_mask]] += (
                next_distr[ne_mask, atom] * (b_j - l)[ne_mask])

        if dones_mask.any():
            proj_distr[dones_mask] = 0.0
            tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards[dones_mask]))
            b_j = (tz_j - Vmin) / DELTA_Z
            l = np.floor(b_j).astype(np.int64)
            u = np.ceil(b_j).astype(np.int64)
            eq_mask = u == l
            if dones_mask.shape == ():
                # Scalar mask: a single transition was passed in.
                if dones_mask:
                    proj_distr[0, l] = 1.0
                else:
                    ne_mask = u != l
                    proj_distr[0, l] = (u - b_j)[ne_mask]
                    proj_distr[0, u] = (b_j - l)[ne_mask]
            else:
                eq_dones = dones_mask.copy()
                eq_dones[dones_mask] = eq_mask
                if eq_dones.any():
                    proj_distr[eq_dones, l[eq_mask]] = 1.0
                ne_mask = u != l
                ne_dones = dones_mask.copy()
                ne_dones[dones_mask] = ne_mask
                if ne_dones.any():
                    proj_distr[ne_dones, l[ne_mask]] = (u - b_j)[ne_mask]
                    proj_distr[ne_dones, u[ne_mask]] = (b_j - l)[ne_mask]
        return torch.FloatTensor(proj_distr).to(device)

    def step(self, states, actions, rewards, next_states, dones):
        """Queue the newest transition of every agent and maybe learn.

        Once an agent's queue holds ``num_mc_steps`` rewards (or its episode
        ended) the accumulated transition is scored and written to memory.
        Every ``UPDATE_EVERY`` calls, if memory holds more than
        ``train_start`` entries, each agent learns from a sampled batch.
        """
        for agent_index in range(len(self.mad4pg_agent)):
            agent_experiences = self.experiences[agent_index]
            agent_experiences.states.appendleft(states[agent_index])
            agent_experiences.rewards.appendleft(
                rewards[agent_index] * self.GAMMA**self.num_mc_steps)
            agent_experiences.actions.appendleft(actions[agent_index])

            if (len(agent_experiences.rewards) == self.num_mc_steps
                    or dones[agent_index]):
                # N-steps return: r= r1+gamma*r2+..+gamma^(t-1)*rt
                done_tensor = torch.tensor(
                    dones[agent_index]).float().to(device)
                condition = True
                while condition:
                    for i in range(len(agent_experiences.rewards)):
                        agent_experiences.rewards[i] /= self.GAMMA

                    # Entries are pushed on the left, so [-1] is the oldest.
                    oldest_state = agent_experiences.states[-1]
                    oldest_action = agent_experiences.actions[-1]

                    state = torch.tensor(
                        oldest_state).float().unsqueeze(0).to(device)
                    next_state = torch.tensor(
                        next_states[agent_index]
                    ).float().unsqueeze(0).to(device)
                    action = torch.tensor(
                        oldest_action).float().unsqueeze(0).to(device)
                    sum_reward = torch.tensor(
                        sum(agent_experiences.rewards)
                    ).float().unsqueeze(0).to(device)

                    with evaluating(
                            self.mad4pg_agent[agent_index]) as cur_agent:
                        q_logits_expected = cur_agent.critic_local(
                            state, action)
                        action_next = cur_agent.actor_target(next_state)
                        q_target_logits_next = cur_agent.critic_target(
                            next_state, action_next)
                        q_target_distr_next = F.softmax(
                            q_target_logits_next, dim=1)
                    q_target_distr_next_projected = self.distr_projection(
                        q_target_distr_next, sum_reward, done_tensor,
                        self.GAMMA**self.num_mc_steps)
                    cross_entropy = -F.log_softmax(
                        q_logits_expected,
                        dim=1) * q_target_distr_next_projected
                    error = cross_entropy.sum(dim=1).mean().item()

                    # Store the same (state, action) the error was computed
                    # for — the oldest queued pair — so the accumulated
                    # reward belongs to the stored transition. The reward is
                    # kept as a CPU array so sample() can np.vstack it
                    # regardless of `device`.
                    self.memory.add(
                        error,
                        (oldest_state, oldest_action,
                         sum_reward.cpu().numpy(),
                         next_states[agent_index], dones[agent_index]))

                    agent_experiences.states.pop()
                    agent_experiences.rewards.pop()
                    agent_experiences.actions.pop()
                    # `False and ...` always evaluates to False, so the loop
                    # body runs exactly once per call.
                    condition = False and dones[agent_index] and len(
                        agent_experiences.states) > 0

            if dones[agent_index]:
                agent_experiences.states.clear()
                agent_experiences.rewards.clear()
                agent_experiences.actions.clear()

        self.t_step = (self.t_step + 1) % self.UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset
            # and learn
            if self.memory.tree.n_entries > self.train_start:
                for agent in self.mad4pg_agent:
                    sampled_experiences, idxs = self.sample()
                    self.learn(agent, sampled_experiences, idxs)

    def sample(self):
        """Draw a batch from memory and convert it to tensors on ``device``.

        Returns
        =======
            ((states, actions, rewards, next_states, dones), idxs)
        """
        # prioritized experience replay
        mini_batch, idxs, is_weights = self.memory.sample(self.BATCH_SIZE)
        # Transpose rows -> columns with zip; building an ndarray from
        # tuples of differently-shaped arrays raises on recent NumPy.
        (state_col, action_col, reward_col,
         next_state_col, done_col) = zip(*mini_batch)

        statess = np.vstack([m for m in state_col if m is not None])
        actionss = np.vstack([m for m in action_col if m is not None])
        rewardss = np.vstack([m for m in reward_col if m is not None])
        next_statess = np.vstack(
            [m for m in next_state_col if m is not None])
        doness = np.vstack([m for m in done_col if m is not None])
        # bool to binary
        doness = doness.astype(int)

        statess = torch.from_numpy(statess).float().to(device)
        actionss = torch.from_numpy(actionss).float().to(device)
        rewardss = torch.from_numpy(rewardss).float().to(device)
        next_statess = torch.from_numpy(next_statess).float().to(device)
        doness = torch.from_numpy(doness).float().to(device)
        return (statess, actionss, rewardss, next_statess, doness), idxs

    def learn(self, agent, experiences, idxs):
        """Update policy and value parameters using given batch of experience tuples.

        The critic is trained with the cross-entropy between its predicted
        distribution and the projected target distribution; the actor is
        trained to maximise ``critic_local.distr_to_q``.

        Params
        ======
            agent: the D4PG agent whose networks are updated
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            idxs: memory indices of the sampled transitions
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        q_logits_expected = agent.critic_local(states, actions)
        actions_next = agent.actor_target(next_states)
        q_targets_logits_next = agent.critic_target(next_states, actions_next)
        q_targets_distr_next = F.softmax(q_targets_logits_next, dim=1)
        q_targets_distr_projected_next = self.distr_projection(
            q_targets_distr_next, rewards, dones,
            self.GAMMA**self.num_mc_steps)
        cross_entropy = -F.log_softmax(
            q_logits_expected, dim=1) * q_targets_distr_projected_next
        critic_loss = cross_entropy.sum(dim=1).mean()

        with torch.no_grad():
            errors = cross_entropy.sum(dim=1).cpu().numpy()
        # update priority (zip keeps this in step with the sampled batch)
        for idx, err in zip(idxs, errors):
            self.memory.update(idx, err)

        # Minimize the loss
        agent.critic_optimizer.zero_grad()
        critic_loss.backward()
        agent.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        actions_pred = agent.actor_local(states)
        crt_distr_v = agent.critic_local(states, actions_pred)
        actor_loss = -agent.critic_local.distr_to_q(crt_distr_v)
        actor_loss = actor_loss.mean()
        # Minimize the loss
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        agent.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        agent.soft_update(agent.critic_local, agent.critic_target, self.TAU)
        agent.soft_update(agent.actor_local, agent.actor_target, self.TAU)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed,
                 buffer_size=BUFFER_SIZE, batch_size=BATCH_SIZE):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None; seed first, then remember the value.
        random.seed(random_seed)
        self.seed = random_seed
        self.buffer_size = buffer_size
        self.memory = Memory(
            capacity=self.buffer_size)  # internal memory using SumTree
        self.batch_size = batch_size

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done,
             batch_size=BATCH_SIZE, update_every=UPDATE_EVERY):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % update_every
        if self.t_step == 0:
            # Learn, if enough samples are available in memory
            if self.memory.tree.n_entries >= batch_size:
                experiences, idxs, is_weights = self.sample()
                self.learn(experiences, idxs, is_weights)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        """Reset the exploration noise process."""
        self.noise.reset()

    def learn(self, experiences, idxs, is_weights, batch_size=BATCH_SIZE,
              gamma=GAMMA):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            idxs: memory indices of the sampled transitions
            is_weights (np.ndarray): importance-sampling weight per sample
            batch_size (int): kept for backward compatibility; the number of
                priorities updated now follows the length of ``idxs``
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Targets are constants for the critic update, so build them
        # without recording a graph through the target networks.
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            Q_targets_next = self.critic_target(next_states, actions_next)
            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)

        # Weight every sample's squared error by its own importance weight.
        # With the default reduction mse_loss already returns a scalar, so
        # the weights would only rescale the mean instead of the samples.
        weights = torch.from_numpy(is_weights).float().to(device).reshape(-1)
        per_sample_loss = F.mse_loss(Q_expected, Q_targets,
                                     reduction='none').reshape(-1)
        critic_loss = (weights * per_sample_loss).mean()

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Introducing gradient clipping
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # ------------- update priorities in prioritized replay buffer ------------- #
        # reshape(-1) keeps a 1-D array even for a single-sample batch,
        # where squeeze() would yield a 0-d array that cannot be indexed.
        errors = (Q_expected - Q_targets).detach().reshape(-1).cpu().numpy()
        for idx, err in zip(idxs, errors):
            self.memory.update(idx, err)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def add(self, state, action, reward, next_state, done, gamma=GAMMA):
        """Add a new experience to memory.

        The arguments are batched (one row per environment/agent); each row
        is stored separately with its own TD error.
        """
        next_state_torch = torch.from_numpy(next_state).float().to(device)
        reward_torch = torch.unsqueeze(
            torch.from_numpy(np.array(reward)).float().to(device), 1)
        done_torch = torch.unsqueeze(
            torch.from_numpy(np.array(done).astype(
                np.uint8)).float().to(device), 1)
        state_torch = torch.from_numpy(state).float().to(device)
        action_torch = torch.from_numpy(action).float().to(device)

        # Put exactly the networks used below into eval mode and restore
        # those same networks afterwards (actor_target was previously left
        # in eval mode).
        scoring_nets = (self.actor_target, self.critic_target,
                        self.critic_local)
        for net in scoring_nets:
            net.eval()
        with torch.no_grad():
            action_next = self.actor_target(next_state_torch)
            Q_target_next = self.critic_target(next_state_torch, action_next)
            Q_target = reward_torch + (gamma * Q_target_next *
                                       (1 - done_torch))
            Q_expected = self.critic_local(state_torch, action_torch)
        for net in scoring_nets:
            net.train()

        # Error used in prioritized replay buffer; reshape(-1) stays 1-D
        # even when only a single row was passed in.
        error = (Q_expected - Q_target).reshape(-1).cpu().numpy()

        # Adding experiences to prioritized replay buffer
        for i in range(len(reward)):
            self.memory.add(
                error[i],
                (state[i], action[i], reward[i], next_state[i], done[i]))

    def sample(self):
        """Sample a batch of experiences from memory.

        Returns
        =======
            ((states, actions, rewards, next_states, dones), idxs, is_weights)
            with the five tensors on ``device``.
        """
        experiences, idxs, is_weights = self.memory.sample(self.batch_size)

        states = np.vstack([e[0] for e in experiences])
        states = torch.from_numpy(states).float().to(device)

        actions = np.vstack([e[1] for e in experiences])
        actions = torch.from_numpy(actions).float().to(device)

        rewards = np.vstack([e[2] for e in experiences])
        rewards = torch.from_numpy(rewards).float().to(device)

        next_states = np.vstack([e[3] for e in experiences])
        next_states = torch.from_numpy(next_states).float().to(device)

        dones = np.vstack([e[4] for e in experiences]).astype(np.uint8)
        dones = torch.from_numpy(dones).float().to(device)

        return (states, actions, rewards, next_states,
                dones), idxs, is_weights
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None; seed first, then remember the value.
        random.seed(seed)
        self.seed = seed

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = Memory(BUFFER_SIZE)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Score one transition, store it, and learn every UPDATE_EVERY steps."""
        # Keep the incoming numpy arrays untouched for storage; tensors are
        # only needed to compute the initial TD error.
        state_t = torch.from_numpy(state).float().unsqueeze(0).to(device)
        next_state_t = torch.from_numpy(
            next_state).float().unsqueeze(0).to(device)

        self.qnetwork_local.eval()
        self.qnetwork_target.eval()
        with torch.no_grad():
            target_action_values = self.qnetwork_target(next_state_t)
            expected_action_values = self.qnetwork_local(state_t)
        self.qnetwork_local.train()
        self.qnetwork_target.train()

        old_val = expected_action_values[0][action].item()
        new_val = reward
        if not done:
            new_val += GAMMA * torch.max(target_action_values).item()
        error = abs(old_val - new_val)

        # Save experience in replay memory. Arrays (not device tensors) are
        # stored so learn() can np.vstack them on any device.
        self.memory.add(error, (state, action, reward, next_state, done))

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset
            # and learn
            if self.memory.tree.n_entries > BATCH_SIZE:
                experiences = self.memory.sample(BATCH_SIZE)
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy()).astype(int)
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Uses a double-DQN target: the local network picks the best next
        action and the target network evaluates it.

        Params
        ======
            experiences: (mini_batches, idxs, is_weights) as returned by
                ``Memory.sample``; each mini-batch entry is a
                (s, a, r, s', done) tuple
            gamma (float): discount factor
        """
        mini_batches, idxs, is_weights = experiences

        states = torch.from_numpy(
            np.vstack([m[0] for m in mini_batches])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([m[1] for m in mini_batches])).long().to(device)
        rewards = torch.from_numpy(
            np.vstack([m[2] for m in mini_batches])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([m[3] for m in mini_batches])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([int(m[4]) for m in mini_batches])).float().to(device)

        # Greedy next action according to the local network ...
        best_next_actions = self.qnetwork_local(
            next_states).detach().max(1)[1].unsqueeze(1)
        # ... evaluated by the target network. gather() replaces a per-row
        # Python loop and keeps the result on the networks' device.
        Q_double_target = self.qnetwork_target(
            next_states).detach().gather(1, best_next_actions)

        Q_observed = rewards + (gamma * Q_double_target * (1 - dones))
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # update priority; flattened so each entry is a scalar
        errors = torch.abs(
            Q_expected - Q_observed).detach().cpu().numpy().reshape(-1)
        for idx, err in zip(idxs, errors):
            self.memory.update(idx, err)

        # Per-sample importance weighting: with the default reduction
        # mse_loss is already a scalar, so the weights would not act on
        # individual samples.
        weights = torch.FloatTensor(is_weights).to(device).reshape(-1, 1)
        loss = (weights *
                F.mse_loss(Q_expected, Q_observed, reduction='none')).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class DQNAgent():
    """DQN agent that trains from a prioritized replay memory."""

    def __init__(self, state_size, action_size):
        """Set hyper-parameters and build the networks, optimizer and memory.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): number of discrete actions
        """
        self.render = False
        self.load_model = False

        # get size of state and action
        self.state_size = state_size
        self.action_size = action_size

        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.lr_step_size = 10
        self.lr_gamma = 0.9
        self.memory_size = 2**15
        self.epsilon = 1.0
        self.epsilon_min = 0.05
        self.explore_step = 1000
        self.epsilon_decay = 0.99995
        self.batch_size = 64
        self.train_start = 10000

        # create prioritized replay memory using SumTree
        self.memory = Memory(self.memory_size)

        # create main model and target model
        self.model = DQN(state_size, action_size)
        self.model.apply(self.weights_init)
        self.target_model = DQN(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.learning_rate)
        self.scheduler = StepLR(self.optimizer,
                                step_size=self.lr_step_size,
                                gamma=self.lr_gamma)

        # initialize target model
        self.update_target_model()

        if self.load_model:
            self.model = torch.load('save_model/per_dqn')
        self.model.train()

    # weight xavier initialize
    def weights_init(self, m):
        """Apply Xavier-uniform init to modules whose class name contains 'Linear'."""
        classname = m.__class__.__name__
        if classname.find('Linear') != -1:
            torch.nn.init.xavier_uniform_(m.weight)

    # after some time interval update the target model to be same with model
    def update_target_model(self):
        """Copy the main model's weights into the target model."""
        self.target_model.load_state_dict(self.model.state_dict())

    # get action from model using epsilon-greedy policy
    def get_action(self, state):
        """Return a random action with probability epsilon, else the greedy one."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        state = torch.from_numpy(state).float()
        with torch.no_grad():
            q_value = self.model(state)
        _, action = torch.max(q_value, 1)
        return int(action)

    # save sample (error,<s,a,r,s'>) to the replay memory
    def append_sample(self, state, action, reward, next_state, done):
        """Store a transition together with its absolute TD error.

        ``state`` and ``next_state`` are expected to carry a leading batch
        dimension of one (the Q-values are indexed as ``[0][action]``).
        """
        with torch.no_grad():
            q_values = self.model(torch.tensor(state).float())
            next_q_values = self.target_model(
                torch.tensor(next_state).float())

        # Read the old estimate as a plain number. Indexing a tensor yields
        # a view, so overwriting the same slot afterwards (as the previous
        # code did) also changed `old_val` and made the error always zero.
        old_val = q_values[0][action].item()
        if done:
            target_val = reward
        else:
            target_val = reward + \
                self.discount_factor * torch.max(next_q_values).item()

        error = abs(old_val - target_val)
        self.memory.add(error, (state, action, reward, next_state, done))

    # pick samples from prioritized replay memory (with batch_size)
    def train_model(self):
        """Run one training step on a sampled batch and return the loss value."""
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            self.epsilon = max(self.epsilon, self.epsilon_min)

        mini_batch, idxs, is_weights = self.memory.sample(self.batch_size)
        # Transpose rows -> columns with zip; building an ndarray from
        # tuples of differently-shaped items raises on recent NumPy.
        (state_col, action_col, reward_col,
         next_state_col, done_col) = zip(*mini_batch)

        states = torch.tensor(np.vstack(state_col)).float()
        actions = torch.tensor(list(action_col),
                               dtype=torch.long).view(-1, 1)
        rewards = torch.tensor(list(reward_col), dtype=torch.float)
        next_states = torch.tensor(np.vstack(next_state_col),
                                   dtype=torch.float)
        # bool to binary
        dones = torch.tensor(np.array(done_col, dtype=int),
                             dtype=torch.float)

        # Q function of current state, restricted to the taken actions
        pred = self.model(states).gather(1, actions).squeeze(1)

        # Q function of next state
        with torch.no_grad():
            next_pred = self.target_model(next_states)

        # Q Learning: get maximum Q value at s' from target model
        target = rewards + (1 - dones) * \
            self.discount_factor * next_pred.max(1)[0]

        errors = torch.abs(pred - target).detach().numpy()

        # update priority
        for idx, err in zip(idxs, errors):
            self.memory.update(idx, err)

        self.optimizer.zero_grad()

        # Importance-weighted MSE. reduction='none' keeps one loss value per
        # sample so each weight scales its own sample; with the default
        # reduction the weights only rescaled an already-averaged scalar.
        weights = torch.tensor(is_weights).float().reshape(-1)
        loss = (weights *
                F.mse_loss(pred, target, reduction='none')).mean()
        loss.backward()

        # and train
        self.optimizer.step()
        return loss.item()
class Dqn():
    """Pixel-wise DQN (RGB + depth input) with prioritized experience replay.

    Transitions are stored in the replay memory as ``'#'``-joined strings
    ``state_path#action#reward#next_state_path``; images are re-loaded from
    disk when a batch is sampled.
    """

    def __init__(self):
        """Create the CUDA networks, replay memory, optimizer and plot axes."""
        self.eval_net, self.target_net = Net(), Net()
        self.eval_net.cuda()
        self.target_net.cuda()

        # create prioritized replay memory using SumTree
        self.memory = Memory(Train_Configs.MEMORY_CAPACITY)
        self.learn_counter = 0
        self.optimizer = optim.Adam(self.eval_net.parameters(),
                                    lr=Train_Configs.LR,
                                    betas=(0.9, 0.99),
                                    eps=1e-08,
                                    weight_decay=2e-5)
        # Element-wise loss (no reduction) so importance weights can be
        # applied per sample; replaces the deprecated reduce/size_average.
        self.loss = nn.MSELoss(reduction='none')

        self.fig, self.ax = plt.subplots()
        self.discount_factor = Train_Configs.GAMMA

    @staticmethod
    def _rgb_path(depth_path):
        """Derive the RGB image path from a depth ``.npy`` path."""
        return depth_path.replace('npy', 'png').replace('state_depth',
                                                        'state_image')

    def store_trans(self, state_path, action, reward, next_state_path, done):
        """Store one transition with its absolute TD error as priority.

        The TD error is computed only for the image rotation selected by the
        action's channel.

        Args:
            state_path: path of the depth file of the current state.
            action: action id, decoded into ``(x, y, channel)``.
            reward: scalar reward.
            next_state_path: path of the depth file of the next state.
            done: ``1`` when the transition is terminal.
        """
        x, y, c = my_utils.translate_actionID_to_XY_and_channel(action)
        trans = state_path + '#' + str(action) + '#' + str(reward) + '#' + \
            next_state_path

        # ------ calculate TD errors from (s,a,r,s')
        state_d = state_path
        next_state_d = next_state_path
        if c > 0:
            state_d = my_utils.get_rotate_depth(c, state_d)
            next_state_d = my_utils.get_rotate_depth(c, next_state_d)

        shape = (1, 3, DIM_STATES[0], DIM_STATES[1])
        state_depth = my_utils.copy_depth_to_3_channel(state_d).reshape(*shape)
        next_state_depth = my_utils.copy_depth_to_3_channel(
            next_state_d).reshape(*shape)

        state_rgb_path = self._rgb_path(state_path)
        next_state_rgb_path = self._rgb_path(next_state_path)
        if c == 0:
            state_rgb = my_utils.trans_HWC_to_CHW(
                cv2.imread(state_rgb_path)).reshape(*shape)
            next_state_rgb = my_utils.trans_HWC_to_CHW(
                cv2.imread(next_state_rgb_path)).reshape(*shape)
        else:
            state_rgb = my_utils.get_rotate_rgb(
                c, state_rgb_path).reshape(*shape)
            next_state_rgb = my_utils.get_rotate_rgb(
                c, next_state_rgb_path).reshape(*shape)

        # numpy to tensor
        state_depth = torch.cuda.FloatTensor(state_depth)
        next_state_depth = torch.cuda.FloatTensor(next_state_depth)
        state_rgb = torch.cuda.FloatTensor(state_rgb)
        next_state_rgb = torch.cuda.FloatTensor(next_state_rgb)

        # Only a priority is needed here, so skip building autograd graphs.
        with torch.no_grad():
            # per the original comments both maps are [1, 1, 224, 224]
            q_map = self.eval_net.forward(state_rgb, state_depth)
            next_q_map = self.target_net.forward(next_state_rgb,
                                                 next_state_depth)

        old_val = q_map[0][0][x][y]
        if done == 1:
            target_q = reward
        else:
            target_q = reward + self.discount_factor * torch.max(next_q_map)

        error = abs(old_val - target_q)
        self.memory.add(float(error), trans)

    def choose_action(self, state_path, EPSILON):
        """Pick an action id and report how it was chosen.

        With probability ``min(EPSILON, 1)`` the network's best action over
        all rotations is used; otherwise the action is sampled either from
        the depth mask or uniformly, each with probability 0.5.

        Returns:
            ``(ac_ty, action)`` where ``ac_ty`` is ``'0'`` (network),
            ``'1'`` (mask sample) or ``'2'`` (uniform sample).
        """
        rgb_path = self._rgb_path(state_path)
        state_rgb = [my_utils.trans_HWC_to_CHW(cv2.imread(rgb_path))]
        state_depth = [my_utils.copy_depth_to_3_channel(state_path)]
        for i in range(1, Train_Configs.ROTATION_BINS):
            state_rgb.append(my_utils.get_rotate_rgb(i, rgb_path))
            state_rotate_depth = my_utils.get_rotate_depth(i, state_path)
            state_depth.append(
                my_utils.copy_depth_to_3_channel(state_rotate_depth))

        # numpy to tensor
        state_rgb = torch.cuda.FloatTensor(np.array(state_rgb))
        state_depth = torch.cuda.FloatTensor(np.array(state_depth))

        prob = np.min((EPSILON, 1))
        p_select = np.array([prob, 1 - prob])
        selected_ac_type = np.random.choice([0, 1], p=p_select.ravel())

        if selected_ac_type == 0:  # origin predicted action
            with torch.no_grad():
                q_map = self.eval_net.forward(state_rgb, state_depth)
            action = my_utils.find_maxQ_in_qmap(
                q_map.cpu().detach().numpy())
            ac_ty = '0'
        else:
            # Uniform draw for an even split; the former randn() draw came
            # from a normal distribution and favoured this branch ~69/31.
            if np.random.rand() <= 0.5:  # sample action according to depth image
                action = my_utils.select_randpID_from_mask(state_path)
                ac_ty = '1'
            else:  # random sample
                action = np.random.randint(0, DIM_ACTIONS)
                ac_ty = '2'
        return ac_ty, action  # the id of action

    def plot(self, ax, x):
        """Redraw the total-reward curve ``x`` on ``ax``."""
        ax.cla()
        ax.set_xlabel("episode")
        ax.set_ylabel("total reward")
        ax.plot(x, 'b-')
        plt.pause(0.000000000000001)

    def load_batch_data(self, batch_list):
        """Load the images for a batch of stored transition strings.

        Args:
            batch_list: iterable of
                ``state_path#action#reward#next_state_path`` strings.

        Returns:
            CUDA tensors ``(state_rgb, state_depth, action, reward,
            next_state_rgb, next_state_depth)``.
        """
        batch_state_rgb = []
        batch_state_depth = []
        batch_action = []
        batch_reward = []
        batch_next_state_rgb = []
        batch_next_state_depth = []
        depth_shape = (3, DIM_STATES[0], DIM_STATES[1])

        for item in batch_list:
            data = item.split('#')
            action_id = int(data[1])
            # NOTE(review): the raw action id is passed as the rotation
            # argument here, whereas store_trans passes the decoded channel
            # -- confirm which one the my_utils helpers expect.
            batch_state_rgb.append(
                my_utils.get_rotate_rgb(action_id, self._rgb_path(data[0])))
            batch_state_depth.append(
                my_utils.copy_depth_to_3_channel(
                    my_utils.get_rotate_depth(action_id, data[0])
                ).reshape(depth_shape))
            batch_action.append([int(data[1])])
            batch_reward.append([float(data[2])])
            batch_next_state_rgb.append(
                my_utils.get_rotate_rgb(action_id, self._rgb_path(data[3])))
            batch_next_state_depth.append(
                my_utils.copy_depth_to_3_channel(
                    my_utils.get_rotate_depth(action_id, data[3])
                ).reshape(depth_shape))

        batch_state_depth = np.array(batch_state_depth)
        batch_next_state_depth = np.array(batch_next_state_depth)

        return (torch.cuda.FloatTensor(batch_state_rgb),
                torch.cuda.FloatTensor(batch_state_depth),
                torch.cuda.LongTensor(batch_action),
                torch.cuda.FloatTensor(batch_reward),
                torch.cuda.FloatTensor(batch_next_state_rgb),
                torch.cuda.FloatTensor(batch_next_state_depth))

    def learn(self):
        """Run one importance-weighted optimisation step.

        Returns:
            ``(loss, mean_q)`` as Python floats.
        """
        # periodically refresh the target network
        if self.learn_counter % Train_Configs.Q_NETWORK_ITERATION == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_counter += 1

        mini_batch, idxs, is_weights = self.memory.sample(
            Train_Configs.BATCH_SIZE)
        (batch_state_rgb, batch_state_depth, batch_action, batch_reward,
         batch_next_state_rgb, batch_next_state_depth) = \
            self.load_batch_data(mini_batch)

        # per the original comment: [BATCH_SIZE, 1, 224, 224]
        eval_q_map = self.eval_net(batch_state_rgb, batch_state_depth)
        x_y_c_list = my_utils.translate_actionID_to_XY_and_channel_batch(
            batch_action)

        # Index the Q map directly so q_eval stays connected to eval_net.
        # Rebuilding it from a Python list (as before) created a fresh leaf
        # tensor, so backward() never reached the network parameters.
        rows = torch.arange(len(x_y_c_list), device=eval_q_map.device)
        xs = torch.as_tensor([int(xyc[0]) for xyc in x_y_c_list],
                             dtype=torch.long, device=eval_q_map.device)
        ys = torch.as_tensor([int(xyc[1]) for xyc in x_y_c_list],
                             dtype=torch.long, device=eval_q_map.device)
        q_eval = eval_q_map[rows, 0, xs, ys].unsqueeze(1)

        # The bootstrap target is a constant w.r.t. the optimised parameters.
        with torch.no_grad():
            target_q_map = self.target_net(batch_next_state_rgb,
                                           batch_next_state_depth)
            q_next = target_q_map.reshape(target_q_map.shape[0], -1).max(
                dim=1, keepdim=True)[0]
            # NOTE(review): stored transitions carry no `done` flag, so
            # terminal next-states are still bootstrapped here.
            q_target = batch_reward + self.discount_factor * q_next

        weight_tensor = torch.as_tensor(
            is_weights, dtype=torch.float32,
            device=q_eval.device).reshape(-1, 1)

        loss = (weight_tensor * self.loss(q_eval, q_target)).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return float(loss), float(q_eval.mean())
import copy

# Source buffers loaded from disk and larger destination buffers that will
# receive the original transitions plus relabelled copies.
m1 = Memory(1000)
m2 = Memory(1000)
m3 = Memory(1000)
M1 = Memory(1500)
M2 = Memory(1500)
M3 = Memory(1500)

m1.load_memory("../training/logger_013/suction_1_memory.pkl")
m2.load_memory("../training/logger_013/suction_2_memory.pkl")
m3.load_memory("../training/logger_013/gripper_memory.pkl")

empty_color = []
empty_depth = []

# Copy the loaded transitions into the larger buffers. All three sources
# are indexed up to m1.length, as in the original script.
for i in range(m1.length):
    M1.add(m1.tree.data[i])
    M2.add(m2.tree.data[i])
    M3.add(m3.tree.data[i])

for i in range(m1.length):
    # Invalid point is common
    if m1.tree.data[i].reward == -3 * R:
        transition = m1.tree.data[i]

        # Work on copies of the pixel index: mutating transition.pixel_idx
        # in place would also rewrite the transition already stored in
        # m1 / M1 (and any transition built from the same object).
        pixel_index_2 = copy.deepcopy(transition.pixel_idx)
        pixel_index_2[0] = 1
        transition_2 = Transition(transition.color, transition.depth,
                                  pixel_index_2, transition.reward,
                                  transition.next_color,
                                  transition.next_depth,
                                  transition.is_empty)
        M2.add(transition_2)

        pixel_index = copy.deepcopy(transition.pixel_idx)
        pixel_index[0] = np.random.choice(range(2, 6))
class IRAgent_FourRooms(OffPolicyAgent_FourRooms):
    """Off-policy Four Rooms agent that stores experience keyed by importance ratio."""

    def __init__(self, n_replay, env, target_policy, behavior_policy, lr,
                 discount, type='BC'):
        """Forward every argument unchanged to the base agent."""
        super().__init__(n_replay, env, target_policy, behavior_policy, lr,
                         discount, type)

    def reset(self, seed=0):
        """Reset step counter, numpy seed, replay buffer and model.

        Must be called before every episode.
        """
        self.t = 0
        np.random.seed(seed)
        self.replay_buffer = Memory(self.n_replay)
        n_cells = self.env.size[0] * self.env.size[1]
        self.build_model(n_cells, 1)

    def generate_experience(self, k=16):
        """Collect ``k`` environment steps into the replay buffer.

        Actions are drawn from the behavior policy; each transition
        ``(s, a, r, s2)`` is added with its target/behavior probability
        ratio. The environment is reset whenever an episode finishes.
        """
        s = self.env.reset()
        done = False
        remaining = k

        while remaining > 0:
            # sample the action from the behavior policy at the current cell
            action_probs = self.behavior_policy[s[0], s[1]]
            a = np.random.choice(a=self.actions, p=action_probs)

            (s2, r, done, _) = self.env.step(a)

            # importance ratio pi(a|s) / mu(a|s)
            pi_a = self.target_policy[s[0], s[1], a]
            mu_a = self.behavior_policy[s[0], s[1], a]
            ratio = pi_a / mu_a

            transition = (s, a, r, s2)
            self.replay_buffer.add(ratio, transition)

            s = s2
            self.t += 1
            remaining -= 1

            # start a fresh episode once the current one has ended
            if done:
                s = self.env.reset()
                done = False

    def train_batch(self, batch_size):
        """Fit the model on one minibatch sampled from the replay buffer."""
        sampled = self.replay_buffer.sample(batch_size)
        data_samples, _, _, buffer_total = sampled

        # pull rewards, next states and states out of the sampled transitions
        rewards = extract_transition_components(data_samples,
                                                TransitionComponent.reward)
        next_states = extract_transition_components(
            data_samples, TransitionComponent.next_state)
        next_state_features = self.construct_features(next_states)
        states = extract_transition_components(data_samples,
                                               TransitionComponent.state)
        state_features = self.construct_features(states)

        # Per-sample weights: all ones unless the bias-corrected ("BC")
        # variant is active, which scales them by buffer_total / n_entries.
        ratios = np.ones(len(states))
        if self.name == "BC":
            correction = buffer_total / self.replay_buffer.tree.n_entries
            ratios = ratios * correction

        dummy_ratios = np.zeros(next_state_features.shape[0])
        next_values = self.model.predict(
            [next_state_features, dummy_ratios]).flatten()

        # A transition whose state equals its next state is treated as
        # terminal, so its bootstrap value is forced to zero.
        for i in range(batch_size):
            same_state = (states[i] == next_states[i]).all()
            if same_state:
                next_values[i] = 0.0

        # bootstrap targets
        targets = (rewards + self.discount * next_values)

        self.model.fit([state_features, ratios], targets,
                       batch_size=batch_size, verbose=0)
class DDQN_Agent:
    """Double-DQN agent with prioritized replay for the drone environment."""

    def __init__(self, useDepth=False):
        """Create networks, environment and optimizer; resume from the newest checkpoint if any.

        Args:
            useDepth: forwarded to ``DroneEnv``.
        """
        self.useDepth = useDepth
        self.eps_start = 0.9
        self.eps_end = 0.05
        self.eps_decay = 30000
        self.gamma = 0.8
        self.learning_rate = 0.001
        self.batch_size = 512
        self.memory = Memory(10000)
        self.max_episodes = 10000
        self.save_interval = 2
        self.test_interval = 10
        self.network_update_interval = 10
        self.episode = -1
        self.steps_done = 0
        self.max_steps = 34

        self.policy = DQN()
        self.target = DQN()
        self.test_network = DQN()
        self.target.eval()
        self.test_network.eval()
        self.updateNetworks()

        self.env = DroneEnv(useDepth)

        if torch.cuda.is_available():
            print('Using device:', device)
            print(torch.cuda.get_device_name(0))
        else:
            print("Using CPU")

        # LOGGING
        cwd = os.getcwd()
        self.save_dir = os.path.join(cwd, "saved models")
        os.makedirs(self.save_dir, exist_ok=True)
        os.makedirs(os.path.join(cwd, "videos"), exist_ok=True)

        if torch.cuda.is_available():
            self.policy = self.policy.to(device)  # to use GPU
            self.target = self.target.to(device)  # to use GPU
            self.test_network = self.test_network.to(device)  # to use GPU

        # model backup: resume from the most recently modified checkpoint
        files = glob.glob(os.path.join(self.save_dir, '*.pt'))
        if len(files) > 0:
            files.sort(key=os.path.getmtime)
            file = files[-1]
            checkpoint = torch.load(file)
            self.policy.load_state_dict(checkpoint['state_dict'])
            self.episode = checkpoint['episode']
            self.steps_done = checkpoint['steps_done']
            self.updateNetworks()
            print("Saved parameters loaded"
                  "\nModel: ", file,
                  "\nSteps done: ", self.steps_done,
                  "\nEpisode: ", self.episode)
        else:
            # Fresh run: truncate log files left over from earlier runs.
            # Each file is checked by its own name (the third check used to
            # test last_episode.txt by mistake).
            for log_name in ('log.txt', 'last_episode.txt',
                             'saved_model_params.txt'):
                if os.path.exists(log_name):
                    open(log_name, 'w').close()

        # Built once, after the networks are on their final device and any
        # checkpoint has been loaded.
        self.optimizer = optim.Adam(self.policy.parameters(),
                                    self.learning_rate)

        obs, _ = self.env.reset()
        tensor = self.transformToTensor(obs)
        writer.add_graph(self.policy, tensor)

    def updateNetworks(self):
        """Copy the policy network's weights into the target network."""
        self.target.load_state_dict(self.policy.state_dict())

    def transformToTensor(self, img):
        """Convert an observation to a float tensor with two leading singleton dims."""
        tensor = torch.FloatTensor(img).to(device)
        tensor = tensor.unsqueeze(0)
        tensor = tensor.unsqueeze(0)
        tensor = tensor.float()
        return tensor

    def convert_size(self, size_bytes):
        """Format a byte count as a human-readable string, e.g. ``"1.5 KB"``.

        Returns ``"0B"`` for zero. The unit is found by repeated division by
        1024, which avoids the rounding pitfalls of ``floor(log(...))`` at
        exact powers of 1024 and never runs past the largest unit.
        """
        if size_bytes == 0:
            return "0B"
        size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
        value = float(size_bytes)
        i = 0
        while value >= 1024 and i < len(size_name) - 1:
            value /= 1024
            i += 1
        return "%s %s" % (round(value, 2), size_name[i])

    def act(self, state):
        """Return an epsilon-greedy action; epsilon decays exponentially with steps."""
        self.eps_threshold = self.eps_end + (self.eps_start - self.eps_end) * math.exp(
            -1.0 * self.steps_done / self.eps_decay
        )
        self.steps_done += 1
        if random.random() > self.eps_threshold:
            with torch.no_grad():
                q_values = self.policy(state).cpu().squeeze().numpy()
            action = np.argmax(q_values)
        else:
            action = random.randrange(0, 4)
        return int(action)

    def append_sample(self, state, action, reward, next_state):
        """Store a transition with its absolute TD error as priority.

        The bootstrap value follows Double DQN: the policy network picks the
        best next action and the target network evaluates it. (Previously
        the target network was indexed with the action taken in the
        *current* state, and a stray trailing comma turned the error into a
        1-tuple.)

        Args:
            state: already-transformed state tensor.
            action: index of the action taken.
            reward: scalar reward.
            next_state: raw next observation (transformed here).
        """
        next_state = self.transformToTensor(next_state)
        with torch.no_grad():
            current_q = self.policy(state).squeeze()[action].item()
            next_action = self.policy(next_state).squeeze().argmax().item()
            next_q = self.target(next_state).squeeze()[next_action].item()

        expected_q = reward + (self.gamma * next_q)
        error = abs(current_q - expected_q)
        self.memory.add(error, state, action, reward, next_state)

    def learn(self):
        """Run one Double-DQN update on a prioritized batch, once enough samples exist."""
        if self.memory.tree.n_entries < self.batch_size:
            return

        # NOTE(review): the importance weights returned by the memory are
        # not applied to the loss (unchanged from the original behaviour).
        states, actions, rewards, next_states, idxs, _is_weights = \
            self.memory.sample(self.batch_size)

        states = torch.cat(tuple(states))
        next_states = torch.cat(tuple(next_states))
        actions = np.asarray(actions)
        rewards = np.asarray(rewards)

        action_idx = torch.as_tensor(
            actions, dtype=torch.long, device=states.device).view(-1, 1)
        current_q = self.policy(states).gather(1, action_idx).squeeze(1)

        # Double DQN target: argmax from the policy net, value from the
        # target net; no gradient flows through the target.
        with torch.no_grad():
            next_actions = self.policy(next_states).argmax(dim=1,
                                                           keepdim=True)
            next_q = self.target(next_states).gather(
                1, next_actions).squeeze(1)
            reward_t = torch.as_tensor(
                rewards, dtype=torch.float32, device=next_q.device)
            expected_q = reward_t + self.gamma * next_q

        errors = torch.abs(current_q - expected_q).detach().cpu().numpy()

        # update priority
        for idx, err in zip(idxs, errors):
            self.memory.update(idx, err)

        loss = F.smooth_l1_loss(current_q, expected_q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def train(self):
        """Main training loop: interact, store, learn, log, checkpoint and test."""
        print("Starting...")

        score_history = []
        reward_history = []

        if self.episode == -1:
            self.episode = 1

        for _ in range(1, self.max_episodes + 1):
            start = time.time()
            state, _ = self.env.reset()
            steps = 0
            score = 0
            while True:
                state = self.transformToTensor(state)

                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action)

                if steps == self.max_steps:
                    done = 1

                self.append_sample(state, action, reward, next_state)
                self.learn()

                state = next_state
                steps += 1
                score += reward
                if done:
                    print("----------------------------------------------------------------------------------------")
                    if self.memory.tree.n_entries < self.batch_size:
                        print("Training will start after ",
                              self.batch_size - self.memory.tree.n_entries,
                              " steps.")
                        break

                    summary = "episode:{0}, reward: {1}, mean reward: {2}, score: {3}, epsilon: {4}, total steps: {5}".format(
                        self.episode, reward, round(score / steps, 2), score,
                        self.eps_threshold, self.steps_done)
                    print(summary)
                    score_history.append(score)
                    reward_history.append(reward)
                    with open('log.txt', 'a') as file:
                        file.write(summary + "\n")

                    if torch.cuda.is_available():
                        print('Total Memory:', self.convert_size(torch.cuda.get_device_properties(0).total_memory))
                        print('Allocated Memory:', self.convert_size(torch.cuda.memory_allocated(0)))
                        print('Cached Memory:', self.convert_size(torch.cuda.memory_reserved(0)))
                        print('Free Memory:', self.convert_size(torch.cuda.get_device_properties(0).total_memory - (
                            torch.cuda.max_memory_allocated() + torch.cuda.max_memory_reserved())))

                        # tensorboard --logdir=runs
                        memory_usage_allocated = np.float64(round(torch.cuda.memory_allocated(0) / 1024 ** 3, 1))
                        memory_usage_cached = np.float64(round(torch.cuda.memory_reserved(0) / 1024 ** 3, 1))

                        writer.add_scalar("memory_usage_allocated", memory_usage_allocated, self.episode)
                        writer.add_scalar("memory_usage_cached", memory_usage_cached, self.episode)

                    writer.add_scalar('epsilon_value', self.eps_threshold, self.episode)
                    writer.add_scalar('score_history', score, self.episode)
                    writer.add_scalar('reward_history', reward, self.episode)
                    writer.add_scalar('Total steps', self.steps_done, self.episode)
                    writer.add_scalars('General Look', {'score_history': score, 'reward_history': reward}, self.episode)

                    # save checkpoint
                    if self.episode % self.save_interval == 0:
                        checkpoint = {
                            'episode': self.episode,
                            'steps_done': self.steps_done,
                            'state_dict': self.policy.state_dict()
                        }
                        torch.save(checkpoint, os.path.join(
                            self.save_dir,
                            'EPISODE{}.pt'.format(self.episode)))

                    if self.episode % self.network_update_interval == 0:
                        self.updateNetworks()

                    self.episode += 1
                    end = time.time()
                    stopWatch = end - start
                    print("Episode is done, episode time: ", stopWatch)

                    if self.episode % self.test_interval == 0:
                        self.test()

                    break
        writer.close()

    def test(self):
        """Run one greedy episode with the target weights, log it and write a video."""
        self.test_network.load_state_dict(self.target.state_dict())

        start = time.time()
        steps = 0
        score = 0
        image_array = []
        state, next_state_image = self.env.reset()
        image_array.append(next_state_image)

        while True:
            state = self.transformToTensor(state)

            with torch.no_grad():
                q_values = self.test_network(state).cpu().squeeze().numpy()
            action = int(np.argmax(q_values))
            next_state, reward, done, next_state_image = self.env.step(action)
            image_array.append(next_state_image)

            if steps == self.max_steps:
                done = 1

            state = next_state
            steps += 1
            score += reward

            if done:
                print("----------------------------------------------------------------------------------------")
                result = "TEST, reward: {}, score: {}, total steps: {}".format(
                    reward, score, self.steps_done)
                print(result)

                with open('tests.txt', 'a') as file:
                    file.write(result + "\n")

                writer.add_scalars('Test', {'score': score, 'reward': reward}, self.episode)

                end = time.time()
                stopWatch = end - start
                print("Test is done, test time: ", stopWatch)

                # Convert images to video
                frameSize = (256, 144)
                import cv2
                video_path = os.path.join(
                    "videos",
                    "test_video_episode_{}_score_{}.avi".format(
                        self.episode, score))
                video = cv2.VideoWriter(video_path,
                                        cv2.VideoWriter_fourcc(*'DIVX'),
                                        7, frameSize)
                try:
                    for img in image_array:
                        video.write(img)
                finally:
                    # always finalise the file, even if a frame is rejected
                    video.release()

                break
class Agent:
    """
    Interacts with and learns from the environment.
    Learns using a Deep Q-Network with prioritised experience replay.
    Two models are instantiated, one for use during evaluation and updating (qnetwork_local) and one to be used for the
    target values in the learning algorithm (qnetwork_target)
    """

    BUFFER_SIZE = int(1e5)  # prioritised experience replay buffer size
    BATCH_SIZE = 64  # minibatch size
    TAU = 1e-3  # for soft update of target parameters
    LR = 5e-4  # learning rate
    UPDATE_EVERY = 4  # how often to update the network

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def __init__(self, state_size: int = 37, action_size: int = 4, seed: int = 44, gamma: float = 0.99,
                 tau: float = 1e-3):
        """
        Initialize an Agent object.

        :param state_size: dimension of each state
        :param action_size: dimension of each action
        :param seed: random seed for network initialisation
        :param gamma: discount factor
        :param tau: lag for soft update of target network parameters
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed first and keep the value itself.
        random.seed(seed)
        self.seed = seed
        self.gamma = gamma
        self.tau = tau
        self.max_w = 0

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.LR)

        # Prioritised Experience Replay memory
        self.memory = Memory(self.BUFFER_SIZE)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state: np.ndarray, action: int, reward: float, next_state: np.ndarray, done: bool,
             gamma: Optional[float] = None, tau: Optional[float] = None):
        """
        An agent step takes the current experience and stores it in the replay memory, then samples from the memory
        and calls the learning algorithm.

        :param state: the state vector
        :param action: the action performed on the state
        :param reward: the reward given upon performing the action
        :param next_state: the next state after doing the action
        :param done: True if the episode has ended
        :param gamma: discount factor; falls back to the value given at construction
        :param tau: lag for soft update of target network parameters; falls back to the value given at construction
        """
        gamma_value = gamma if gamma is not None else self.gamma
        tau_value = tau if tau is not None else self.tau

        # Save experience in replay memory
        self.memory.add((state, action, reward, next_state, done))

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.UPDATE_EVERY
        if self.t_step != 0:
            return

        # If enough samples are available in memory, get random subset and learn
        if self.memory.tree.n_entries > self.BATCH_SIZE:
            experiences, idxs, importance_weights = self.memory.sample(self.BATCH_SIZE)
            self.learn(experiences, idxs, importance_weights, gamma_value, tau_value)

    def act(self, state: np.ndarray, eps: float = 0.0):
        """
        Returns actions for given state as per current policy. Uses the local copy of the model.

        :param state: current state
        :param eps: epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.int32(np.argmax(action_values.cpu().data.numpy()))
        else:
            return np.int32(random.choice(np.arange(self.action_size)))

    def learn(self, experiences: Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor],
              indices: np.ndarray, importance_weights: torch.Tensor, gamma: float, tau: float):
        """
        Update value parameters using given batch of experience tuples.

        :param experiences: tuple of (s, a, r, s', done) tensors
        :param indices: indices of the SumTree that contain the priority values for these experiences. Used for
            updating the priority values after error has been found
        :param importance_weights: the weighting that each experience carries when used in updating the network
        :param gamma: discount factor
        :param tau: lag for soft update of target network parameters
        """
        states, actions, rewards, next_states, dones = experiences

        # The bootstrap target is a constant for the optimisation step, so
        # build it without autograd; otherwise gradients would also be
        # accumulated in the target network's parameters.
        with torch.no_grad():
            # For Double-DQN, get action with the highest q-value (for next_states) from the local model
            next_action = self.qnetwork_local(next_states).max(1)[1].unsqueeze(1)
            # Evaluate that action with the target model
            q_targets_next = self.qnetwork_target(next_states).gather(1, next_action)
            # Compute Q targets for current states
            q_targets = rewards + (gamma * q_targets_next * (1 - dones))

        # Get expected Q values from local model
        q_expected = self.qnetwork_local(states).gather(1, actions)

        # .cpu() so the conversion also works when the networks live on GPU
        error = torch.abs(q_targets - q_expected).detach().cpu().numpy()

        # update priorities
        self.memory.batch_update(indices, error)

        # Importance-weighted squared error. The element-wise loss is needed
        # so that each weight scales its own sample; the weights are shaped
        # like q_expected to prevent an accidental (B, B) broadcast.
        weights = torch.as_tensor(
            importance_weights, dtype=q_expected.dtype, device=q_expected.device
        ).reshape(q_expected.shape)
        squared_error = F.mse_loss(q_expected, q_targets, reduction='none')
        loss = (weights * squared_error).mean()

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # update target network with model parameters approaching those of the local network.
        self.soft_update(self.qnetwork_local, self.qnetwork_target, tau)

    @staticmethod
    def soft_update(local_model: torch.nn.Module, target_model: torch.nn.Module, tau: float):
        """
        Soft update model parameters. Every learning step the target network is updated to bring its parameters
        nearer by a factor TAU to those of the improving local network.

        If TAU = 1 the target network becomes a copy of the local network.
        If TAU = 0 the target network is not updated.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        :param local_model: weights will be copied from
        :param target_model: weights will be copied to
        :param tau: interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent:  # todo change name to Agent
    """Deep Q-learning from Demonstrations style agent.

    The total loss combines a one-step TD loss, a supervised margin loss on
    demonstration samples (``JE``), an n-step return loss on demonstration
    samples (``Jn``) and an explicit L2 penalty.
    """

    def __init__(self, eps, lr, gamma, batch_size, tau, max_memory, lambda_1,
                 lambda_2, lambda_3, n_steps, l_margin):
        """Store hyper-parameters and build replay memory, policy and optimizer.

        Args:
            eps: probability of a random action (eps-greedy).
            lr: learning rate for Adam.
            gamma: discount factor.
            batch_size: number of samples per learning step.
            tau: number of learning steps between target-network updates.
            max_memory: capacity of the replay memory.
            lambda_1: weight of the n-step return loss.
            lambda_2: weight of the supervised loss.
            lambda_3: weight of the L2 term (also used as Adam weight decay).
            n_steps: horizon of the n-step return.
            l_margin: margin used by the supervised loss.
        """
        # Input Parameters
        self.eps = eps  # eps-greedy
        self.gamma = gamma  # discount factor
        self.batch_size = batch_size
        self.tau = tau  # frequency of target replacement
        self.ed = 0.005  # bonus for demonstration # todo they aren't used
        self.ea = 0.001  # todo they aren't used
        self.l_margin = l_margin
        self.n_steps = n_steps
        self.lambda1 = lambda_1  # n-step return
        self.lambda2 = lambda_2  # supervised loss
        self.lambda3 = lambda_3  # L2

        self.counter = 0  # target replacement counter # todo change to iter_counter
        self.replay = Memory(capacity=max_memory)
        self.loss = nn.MSELoss()
        self.policy = Policy()  # todo change not have to pass architecture
        self.opt = optim.Adam(self.policy.predictNet.parameters(), lr=lr,
                              weight_decay=lambda_3)

        self.replay.e = 0
        # episode id -> ordered list of that episode's demonstration transitions
        self.demoReplay = ddict(list)

        self.noisy = hasattr(self.policy.predictNet, "sample")

    def choose_action(self, state):
        """Return the action to take in ``state``.

        With a noisy network the best action after re-sampling the noise is
        returned; otherwise the choice is eps-greedy.
        """
        state = torch.Tensor(state)
        A = self.policy.sortedA(state)
        if self.noisy:
            self.policy.predictNet.sample()
            return A[0]

        if np.random.random() < self.eps:
            return random.sample(A, 1)[0]
        return A[0]

    def sample(self):
        """Sample ``batch_size`` transitions from the replay memory."""
        return self.replay.sample(self.batch_size)

    def store_demonstration(self, s, a, r, s_, done, episode):
        """Store a demonstration transition, tagged with ``(episode, index)``."""
        s = torch.Tensor(s)
        s_ = torch.Tensor(s_)
        episodeReplay = self.demoReplay[episode]  # replay of certain demo episode
        index = len(episodeReplay)
        data = (s, a, r, s_, done, (episode, index))
        episodeReplay.append(data)
        self.replay.add(transition=data, demonstration=True)

    def store_transition(self, s, a, r, s_, done):
        """Store a self-generated transition (demo tag is ``None``)."""
        s = torch.Tensor(s)
        s_ = torch.Tensor(s_)
        data = (s, a, r, s_, done, None)
        self.replay.add(transition=data, demonstration=False)

    def calculate_td_errors(self, samples):
        """Compute predicted and one-step target Q values for ``samples``.

        Returns:
            ``(Qpredict, Qtarget)``.
        """
        if self.noisy:
            self.policy.predictNet.sample()  # for choosing action
        alls, alla, allr, alls_, alldone, *_ = zip(*samples)
        maxA = [self.policy.sortedA(s_)[0] for s_ in alls_]
        if self.noisy:
            self.policy.predictNet.sample()  # for prediction
            self.policy.targetNet.sample()  # for target

        # only non-terminal transitions receive the bootstrap term
        not_done = torch.tensor(alldone) != 1
        Qtarget = torch.Tensor(allr)
        Qtarget[not_done] += self.gamma * self.policy.calcQ(
            self.policy.targetNet, alls_, maxA)[not_done]
        Qpredict = self.policy.calcQ(self.policy.predictNet, alls, alla)
        return Qpredict, Qtarget

    def JE(self, samples):
        """Supervised margin loss averaged over the demonstration samples.

        A sample contributes ``Q(s, a*) - Q(s, aE)`` unless the expert action
        already beats the best other action ``a*`` by more than the margin.
        """
        loss = torch.tensor(0.0)
        count = 0  # number of demo
        for s, aE, *_, isdemo in samples:
            if isdemo is None:
                continue
            A = self.policy.sortedA(s)
            if len(A) == 1:
                continue
            QE = self.policy.calcQ(self.policy.predictNet, s, aE)
            A1, A2 = np.array(A)[:2]  # action with largest and second largest Q
            maxA = A2 if (A1 == aE).all() else A1
            Q = self.policy.calcQ(self.policy.predictNet, s, maxA)
            if (Q + self.l_margin) < QE:
                continue
            else:
                loss += (Q - QE)
                count += 1
        return loss / count if count != 0 else loss

    def Jn(self, samples, Qpredict):
        """n-step return loss averaged over usable demonstration samples.

        Returns a zero tensor when the batch contains no usable
        demonstration sample; dividing by a zero count would otherwise
        yield NaN and poison the total loss.
        """
        # wait for refactoring, can't use with noisy layer
        loss = torch.tensor(0.0)
        count = 0
        for i, (s, a, r, s_, done, isdemo) in enumerate(samples):
            if isdemo is None:
                continue
            episode, idx = isdemo
            nidx = idx + self.n_steps
            lepoch = len(self.demoReplay[episode])
            if nidx > lepoch:
                # not enough remaining steps in this episode
                continue
            count += 1
            ns, na, nr, ns_, ndone, _ = zip(
                *self.demoReplay[episode][idx:nidx])
            ns, na, ns_, ndone = ns[-1], na[-1], ns_[-1], ndone[-1]
            # sum_k gamma^k * r_k over the n-step window
            discountedR = reduce(
                lambda x, y: (x[0] + self.gamma**x[1] * y, x[1] + 1),
                nr, (0, 0))[0]
            maxA = self.policy.sortedA(ns_)[0]
            target = discountedR if ndone else discountedR + self.gamma**self.n_steps * self.policy.calcQ(
                self.policy.targetNet, ns_, maxA)
            predict = Qpredict[i]
            loss += (target - predict)**2
        return loss / count if count != 0 else loss

    def L2(self, parameters):
        """Sum of squared entries over all ``parameters``."""
        loss = 0
        for p in parameters:
            loss += (p**2).sum()
        return loss

    def learn(self):
        """Run one optimisation step and refresh the sampled priorities."""
        self.opt.zero_grad()
        samples, idxs, = self.sample()
        Qpredict, Qtarget = self.calculate_td_errors(samples)

        # new priorities are the absolute one-step TD errors
        for i in range(self.batch_size):
            error = math.fabs(float(Qpredict[i] - Qtarget[i]))
            self.replay.update(idxs[i], error)

        JDQ = self.loss(Qpredict, Qtarget)
        JE = self.JE(samples)
        Jn = self.Jn(samples, Qpredict)
        L2 = self.L2(self.policy.predictNet.parameters())
        J = JDQ + self.lambda2 * JE + self.lambda1 * Jn + self.lambda3 * L2
        J.backward()
        self.opt.step()

        self.counter += 1
        if self.counter % self.tau == 0:
            self.policy.updateTargetNet()
next_pc.pc, image_path + "next_", depth_path + "next_", iteration) is_empty = _check_if_empty(next_pc.pc) current_reward = utils.reward_judgement( reward, is_valid, action_success) return_ += current_reward * np.power(discount_factor, t) print "\033[1;33mCurrent reward: {} \t Return: {}\033[0m".format( current_reward, return_) # Store transition to experience buffer color_name, depth_name, next_color_name, next_depth_name = utils.wrap_strings( image_path, depth_path, iteration) transition = Transition(color_name, depth_name, pixel_index, current_reward, next_color_name, next_depth_name, is_empty) if pixel_index[0] == 0: suction_1_memory_buffer.add(transition) elif pixel_index[0] == 1: suction_2_memory_buffer.add(transition) else: gripper_memory_buffer.add(transition) print "Suction_1_Buffer: {} | Suction_2_Buffer: {} | Gripper_Buffer: {}".format( suction_1_memory_buffer.length, suction_2_memory_buffer.length, gripper_memory_buffer.length) iteration += 1 t += 1 ################################TRAIN################################ # Start training after buffer has sufficient experiences if suction_1_memory_buffer.length > mini_batch_size and \ suction_2_memory_buffer.length > mini_batch_size and \ gripper_memory_buffer.length > mini_batch_size: