def main():
    num_digits = 4
    state_size = 128
    embedding_dim = 8
    model = StateModel(num_digits, state_size, embedding_dim)
    env = gym.make("GuessNumEnv-v0")
    episodes = 100
    max_episode_len = 100
    replay_memory = ReplayMemory(1000)
    for ep in range(episodes):
        state, reward, done = env.reset()
        state = torch.from_numpy(state)
        # Shift by one because the valid actions are the digits 1..9
        action = torch.argmax(
            model((state[:, :-2].unsqueeze(0).long(),
                   state[:, -2:].unsqueeze(0).float())),
            dim=-1) + 1
        next_state, reward, done = env.step(action.numpy().reshape(-1,))
        t = Transition(state=state, next_state=next_state, reward=reward,
                       action=action)
        env.render()
        print(reward, done)
        break
def learn(self):
    if len(self.memory) < self.batch_size:
        return
    transitions = self.memory.sample(self.batch_size)
    batch = Transition(*zip(*transitions))
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                            batch.next_state)),
                                  device=device, dtype=torch.bool)
    non_final_next_states = torch.cat(
        [s for s in batch.next_state if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.Tensor(batch.reward)
    state_action_values = self.policy_net(state_batch).gather(1, action_batch)
    next_state_values = torch.zeros(self.batch_size, device=device)
    next_state_values[non_final_mask] = self.target_net(
        non_final_next_states).max(1)[0].detach()
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * self.gamma) + reward_batch
    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values,
                            expected_state_action_values.unsqueeze(1))
    # Optimize the model
    self.optimizer.zero_grad()
    loss.backward()
    for param in self.policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    self.optimizer.step()
def optimize_model():
    if len(memory) < batch_size:
        return
    transitions = memory.sample(batch_size)
    batch = Transition(*zip(*transitions))
    non_final_mask = torch.tensor(tuple(
        map(lambda s: s is not None, batch.next_state)),
        device=device, dtype=torch.bool)
    non_final_next_states = torch.cat(
        [s for s in batch.next_state if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)
    state_action_values = policy_net(state_batch.float()).gather(
        1, action_batch)
    next_state_values = torch.zeros(batch_size, device=device)
    next_state_values[non_final_mask] = target_net(
        non_final_next_states.float()).max(1)[0].detach()
    expected_state_action_values = (next_state_values * gama_discount) + reward_batch
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values,
                     expected_state_action_values.unsqueeze(1))
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
def optimize_dqn(bsz, opt_step):
    transitions = memory.sample(bsz)
    batch = Transition(*zip(*transitions))
    non_final_mask = torch.ByteTensor(
        tuple(map(lambda s: s is not None, batch.next_state)))
    non_final_next_states_t = torch.cat(
        tuple(s for s in batch.next_state if s is not None)).type(dtype)
    non_final_next_states = Variable(non_final_next_states_t, volatile=True)
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)
    if USE_CUDA:
        state_batch = state_batch.cuda()
        action_batch = action_batch.cuda()
        reward_batch = reward_batch.cuda()
        non_final_mask = non_final_mask.cuda()
    q_vals = policy_net(state_batch)
    # gather along dim 1 needs a column of action indices, one per batch row
    state_action_values = q_vals.gather(1, action_batch.unsqueeze(1)).squeeze(1)
    next_state_values = Variable(torch.zeros(bsz).cuda())
    next_state_values[non_final_mask] = target_net(
        non_final_next_states).data.max(1)[0]
    expected_state_action_values = (next_state_values * args.gamma) + reward_batch
    q_loss = F.mse_loss(state_action_values, expected_state_action_values,
                        size_average=False)
    loss = q_loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
def optimize_model(self): """ Train model. """ if len(self.memory) < self.batch_size: return 0.0 transitions = self.memory.sample(self.batch_size) # batch is ([state], [action], [next_state], [reward]) batch = Transition(*zip(*transitions)) non_final_mask = torch.tensor(tuple( map(lambda s: s is not None, batch.next_state)), device=self.device) non_final_next_states = torch.cat([ torch.tensor([s], dtype=torch.float) for s in batch.next_state if s is not None ]) state_batch = torch.cat( [torch.tensor([s], dtype=torch.float) for s in batch.state]) action_batch = torch.cat( [torch.tensor([[s]], dtype=torch.long) for s in batch.action]) reward_batch = torch.cat( [torch.tensor([[s]], dtype=torch.float) for s in batch.reward]) q_eval = self.policy_net(state_batch).gather(1, action_batch) q_next = torch.zeros(self.batch_size, device=self.device) q_next[non_final_mask] = self.target_net(non_final_next_states).max( 1)[0].detach() q_target = (q_next * self.gamma) + reward_batch.squeeze() loss = F.mse_loss(q_eval, q_target.unsqueeze(1)) self.optimizer.zero_grad() loss.backward() self.optimizer.step() return loss.item()
def optimize_policy(replay_buffer, policy_net, target_net, optimizer, loss_function): """ This method optimizes the policy network by minimizing the TD error between the Q from the policy network and the Q calculated through a Bellman backup via the target network. """ global losses global eps_threshold if len(replay_buffer) < BATCH_SIZE: return transitions = replay_buffer.sample(BATCH_SIZE) batch = Transition(*zip(*transitions)) # Manage edge cases non_final_mask = torch.tensor(tuple( map(lambda x: x is not None, batch.next_state)), device=device, dtype=torch.bool) non_final_next_states = torch.stack( [x for x in batch.next_state if x is not None]) # Create batch state_batch = torch.stack(batch.state) action_batch = torch.stack(batch.action) reward_batch = torch.stack(batch.reward) # Get Q value per policy network policy_net.train() state_action_values = policy_net(state_batch).gather(1, action_batch) # Get Q value per target_network next_state_values = torch.zeros(BATCH_SIZE, device=device) next_state_values[non_final_mask] = target_net(non_final_next_states).max( 1).values expected_state_action_values = ( next_state_values.unsqueeze(1) * GAMMA) + reward_batch # value at terminal state is reward_batch # Compute loss loss = loss_function(state_action_values, expected_state_action_values) losses.append(loss.item()) # Optimize the policy network optimizer.zero_grad() loss.backward() clip_grad_norm_(policy_net.parameters(), 2.0) optimizer.step() eps_threshold = update_epsilon(eps_threshold) # Record output if RECORD: grad_norm = torch.stack( [params.grad.data.norm() for params in policy_net.parameters()]) writer.add_scalar('TD Loss', loss.item(), total_iterations) writer.add_scalar('Min Gradient Norm', grad_norm.min().item(), total_iterations) writer.add_scalar('Max Gradient Norm', grad_norm.max().item(), total_iterations) writer.add_scalar('Epsilon', eps_threshold, total_iterations)
def optimize_model(BATCH_SIZE, memory, device, policy_net, target_net, GAMMA, optimizer): # performs a single step of the optimization. It first samples a batch, # concatenates all the tensors into a single one, computes Q(st,at) and # V(st+1)=maxaQ(st+1,a), and combines them into our loss. By defition we # set V(s)=0 if s is a terminal state. We also use a target network to # compute V(st+1) for added stability. The target network has its weights # kept frozen most of the time, but is updated with the policy network’s # weights every so often. This is usually a set number of steps but we # shall use episodes for simplicity. if len(memory) < BATCH_SIZE: return transitions = memory.sample(BATCH_SIZE) # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for # detailed explanation). This converts batch-array of Transitions # to Transition of batch-arrays. batch = Transition(*zip(*transitions)) # Compute a mask of non-final states and concatenate the batch elements # (a final state would've been the one after which simulation ended) non_final_mask = torch.tensor( tuple(map(lambda s: s is not None, batch.next_state)), device=device, # dtype=torch.uint8, dtype=torch.bool, ) non_final_next_states = torch.cat( [s for s in batch.next_state if s is not None]) state_batch = torch.cat(batch.state) action_batch = torch.cat(batch.action) reward_batch = torch.cat(batch.reward) # Compute Q(s_t, a) - the model computes Q(s_t), then we select the # columns of actions taken. These are the actions which would've been taken # for each batch state according to policy_net state_action_values = policy_net(state_batch).gather(1, action_batch) # Compute V(s_{t+1}) for all next states. # Expected values of actions for non_final_next_states are computed based # on the "older" target_net; selecting their best reward with max(1)[0]. # This is merged based on the mask, such that we'll have either the expected # state value or 0 in case the state was final. next_state_values = torch.zeros(BATCH_SIZE, device=device) next_state_values[non_final_mask] = ( target_net(non_final_next_states).max(1)[0].detach()) # Compute the expected Q values expected_state_action_values = (next_state_values * GAMMA) + reward_batch # Compute Huber loss loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1)) # Optimize the model optimizer.zero_grad() loss.backward() for param in policy_net.parameters(): param.grad.data.clamp_(-1, 1) optimizer.step()
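# Most of the optimize/learn variants in this file assume a Transition
# namedtuple and a ReplayMemory buffer along the lines of the PyTorch DQN
# tutorial. A minimal sketch follows; field names and order differ between
# the snippets here, so treat this as an assumption, not the exact
# definitions each function was written against.
import random
from collections import deque, namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))


class ReplayMemory:
    """Fixed-capacity cyclic buffer of Transition tuples."""

    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Save a transition."""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        """Draw a uniform random mini-batch of transitions."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)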
def _do_network_update(self): if len(self.memory) < self.batch_size: return transitions = self.memory.sample(self.batch_size) # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for # detailed explanation). This converts batch-array of Transitions # to Transition of batch-arrays. batch = Transition(*zip(*transitions)) # Compute a mask of non-final states and concatenate the batch elements # (a final state would've been the one after which simulation ended) non_final_mask = 1 - torch.tensor(batch.done, dtype=torch.uint8) non_final_mask = non_final_mask.type(torch.bool) non_final_next_states = [ s for nonfinal, s in zip(non_final_mask, batch.next_state) if nonfinal > 0 ] non_final_next_states = torch.stack(non_final_next_states) state_batch = torch.stack(batch.state) action_batch = torch.cat(batch.action) reward_batch = torch.cat(batch.reward) # Compute Q(s_t, a) - the model computes Q(s_t), then we select the # columns of actions taken. These are the actions which would've been taken # for each batch state according to policy_net state_action_values = self.policy_net(state_batch).gather( 1, action_batch) # Compute V(s_{t+1}) for all next states. # Expected values of actions for non_final_next_states are computed based # on the "older" target_net; selecting their best reward with max(1)[0]. # This is merged based on the mask, such that we'll have either the expected # state value or 0 in case the state was final. next_state_values = torch.zeros(self.batch_size) next_state_values[non_final_mask] = self.target_net( non_final_next_states).max(1)[0].detach() # Task 4: DONE: Compute the expected Q values expected_q_values = [ reward_batch[i].item() if batch.done[i] else reward_batch[i].item() + self.gamma * next_state_values[i].item() for i in range(len(batch.done)) ] # Array is converted to numpy expected_q_values = np.array(expected_q_values) expected_state_action_values = torch.tensor(expected_q_values, dtype=torch.float32) # Compute Huber loss loss = F.smooth_l1_loss(state_action_values.squeeze(), expected_state_action_values) # Optimize the model self.optimizer.zero_grad() loss.backward() for param in self.policy_net.parameters(): param.grad.data.clamp_(-1e-1, 1e-1) self.optimizer.step()
def learn(self):
    """
    Learning function
    :return:
    """
    if len(self.memory) < self.batch_size:
        return
    transitions = self.memory.sample(self.batch_size)
    batch = Transition(*zip(*transitions))
    non_final_mask = ~T.tensor(batch.done, dtype=T.bool)
    # avoid a batch in which every transition is terminal (the mask would
    # otherwise select an empty set of next states); resample if needed
    while not non_final_mask.any():
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))
        non_final_mask = ~T.tensor(batch.done, dtype=T.bool)
    non_final_next_states = [
        s for nonfinal, s in zip(non_final_mask, batch.next_state)
        if nonfinal
    ]
    non_final_next_states = T.stack(non_final_next_states)
    state_batch = T.stack(batch.state)
    action_batch = T.cat(batch.action)
    reward_batch = T.cat(batch.reward)
    state_action_values = self.policy_net(state_batch).gather(
        1, action_batch)
    next_state_values = T.zeros(self.batch_size)
    next_state_values[non_final_mask] = self.target_net(
        non_final_next_states).max(1)[0].detach()
    expected_state_action_values = (next_state_values * self.gamma) + reward_batch
    # Compute mse loss
    loss = F.mse_loss(state_action_values.squeeze(),
                      expected_state_action_values)
    # Optimize the model
    self.policy_net.optimizer.zero_grad()
    loss.backward()
    for param in self.policy_net.parameters():
        param.grad.data.clamp_(-1e-1, 1e-1)
    self.policy_net.optimizer.step()
def optimize_model(optimizer, memory, model, model_target, batch_size, gamma, use_cuda): if len(memory) < batch_size: return transitions = memory.sample(batch_size) # Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for # detailed explanation). batch = Transition(*zip(*transitions)) # Compute a mask of non-final states and concatenate the batch elements non_final_mask = torch.ByteTensor( tuple(map(lambda s: s is not None, batch.next_state))) # We don't want to backprop through the expected action values and volatile # will save us on temporarily changing the model parameters' # requires_grad to False! non_final_next_states = Variable(torch.stack( [s for s in batch.next_state if s is not None]), volatile=True) state_batch = Variable(torch.stack(batch.state)) action_batch = Variable(torch.cat(batch.action)) reward_batch = Variable(torch.stack(batch.reward)) if use_cuda: non_final_mask = non_final_mask.cuda() non_final_next_states = non_final_next_states.cuda() state_batch = state_batch.cuda() action_batch = action_batch.cuda() reward_batch = reward_batch.cuda() # Compute Q(s_t, a) - the model computes Q(s_t), then we select the # columns of actions taken state_action_values = model(state_batch).gather(1, action_batch) # Compute V(s_{t+1}) for all next states. next_state_values = Variable(torch.zeros(batch_size, 1).type(torch.Tensor)) if use_cuda: next_state_values = next_state_values.cuda() next_state_values[non_final_mask] = model_target( non_final_next_states).max(1)[0] # Now, we don't want to mess up the loss with a volatile flag, so let's # clear it. After this, we'll just end up with a Variable that has # requires_grad=False next_state_values.volatile = False # Compute the expected Q values expected_state_action_values = (next_state_values * gamma) + reward_batch # Compute Huber loss loss = F.mse_loss(state_action_values, expected_state_action_values) # Optimize the model optimizer.zero_grad() loss.backward() for param in model.parameters(): param.grad.data.clamp_(-1, 1) optimizer.step()
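# The Variable/volatile idiom in optimize_model above predates PyTorch 0.4. On
# current PyTorch the same "don't backprop through the bootstrap target" effect
# is normally expressed with torch.no_grad(). A minimal sketch, using a
# hypothetical helper name and the tensor shapes assumed above:
import torch


def compute_td_target(model_target, non_final_next_states, non_final_mask,
                      reward_batch, gamma, batch_size, device):
    """Hypothetical helper: the torch.no_grad() equivalent of the
    volatile-Variable trick used above to build the TD target."""
    next_state_values = torch.zeros(batch_size, device=device)
    with torch.no_grad():
        next_state_values[non_final_mask] = model_target(
            non_final_next_states).max(1)[0]
    return (next_state_values * gamma) + reward_batch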
def real_batch(policy, env, batch_size): states, actions, next_states, masks, rewards = rollout(policy, env, batch_size) rewards = np.array([item for sublist in rewards for item in sublist]) batch = Transition(states, actions, masks, next_states, rewards) return batch
def optimize_model(batch_size, memory, policy_net, target_net, optimizer, GAMMA=0.999, device='cuda'): """Optimize the model for one step Return mini-batch loss """ if len(memory) < batch_size: return transitions = memory.sample(batch_size) # Transpose the batch. This converts batch-array of Transitions to Transition of batch-arrays batch = Transition(*zip(*transitions)) # Compute a mask of non-final states and concatenate the batch elements # (a final state would've been the one after which simulation ended) non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)), device=device, dtype=torch.uint8) non_final_next_states = torch.cat([s for s in batch.next_state if s is not None]) state_batch = torch.cat(batch.state) action_batch = torch.cat(batch.action) reward_batch = torch.cat(batch.reward) # Compute Q(s_t, a) - the model computes Q(s_t), then we select the columns of actions taken. # These are the actions which would've been taken for each batch state according to policy_net state_action_values = policy_net(state_batch).gather(1, action_batch) # Compute V(s_{t+1}) for all next states # Expected values of actions for non_final_next_states are computed # This is merged based on the mask, such that we'll have either the expected state value or 0 # in case the state was final # DOUBLE DQN implementation: # . we use the online policy net to greedily select the action # . and the target net to estimate the Q-value next_state_values = torch.zeros(batch_size, device=device) next_action_policynet_decisions = policy_net(non_final_next_states).max(1)[1] non_final_next_state_targetnet_values = target_net(non_final_next_states) \ .gather(1, next_action_policynet_decisions.view(-1, 1).repeat(1, 2))[:, 0] next_state_values[non_final_mask] = non_final_next_state_targetnet_values.detach() # Compute the expected Q values expected_state_action_values = (next_state_values * GAMMA) + reward_batch # Compute Huber loss loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1)) # Optimize the model optimizer.zero_grad() loss.backward() for param in policy_net.parameters(): param.grad.data.clamp_(-1, 1) optimizer.step() # Return minibatch huber loss return loss.item()
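# Note on the Double-DQN block above: gather(...).repeat(1, 2)[:, 0] duplicates
# a column only to discard it. An equivalent, more direct way to let the online
# network pick the action and the target network score it (a sketch reusing the
# policy_net / target_net assumptions above, not the original code):
import torch


def double_dqn_next_values(policy_net, target_net, non_final_next_states):
    """Hypothetical helper: argmax under policy_net, evaluated by target_net."""
    with torch.no_grad():
        next_actions = policy_net(non_final_next_states).argmax(dim=1, keepdim=True)
        return target_net(non_final_next_states).gather(1, next_actions).squeeze(1)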
def ma_batch(policies, env, batch_size): states, actions, next_states, masks, rewards, avg_reward = ma_rollout( policies, env, batch_size) batches = [] for idx in range(len(states)): batches.append( Transition(np.array(states[idx]), np.array(actions[idx]), np.array(masks[idx]).reshape(-1), np.array(next_states[idx]), np.array(rewards[idx]).reshape(-1))) return batches
def update_model(self):
    if self.use_PER:
        batch_index, batch, ImportanceSamplingWeights = self.replay.sample(
            self.batch_size)
    else:
        batch = self.replay.sample(self.batch_size)
    batch_tuple = Transition(*zip(*batch))
    state = torch.stack(batch_tuple.state)
    action = torch.stack(batch_tuple.action)
    reward = torch.stack(batch_tuple.reward)
    next_state = torch.stack(batch_tuple.next_state)
    done = torch.stack(batch_tuple.done)
    self.optimizer.zero_grad()
    if self.use_ICM:
        self.icm.optimizer.zero_grad()
        forward_loss = self.icm.get_forward_loss(state, action, next_state)
        inverse_loss = self.icm.get_inverse_loss(state, action, next_state)
        icm_loss = (1 - self.icm.beta) * inverse_loss.mean() + \
            self.icm.beta * forward_loss.mean()
    td_estimates = self.policy(state).gather(1, action).squeeze()
    td_targets = reward + (1 - done.float()) * self.gamma * \
        self.target(next_state).max(1)[0].detach_()
    if self.use_PER:
        # weight each sample's TD loss by its importance-sampling weight
        loss = (torch.tensor(ImportanceSamplingWeights, device=self.device)
                * self.loss_function(td_estimates, td_targets)).sum()
        errors = td_estimates - td_targets
        self.replay.batch_update(batch_index, errors.data.numpy())
    else:
        loss = self.loss_function(td_estimates, td_targets)
    if self.use_ICM:
        loss = self.icm.lambda_weight * loss + icm_loss
    loss.backward()
    for param in self.policy.parameters():
        param.grad.data.clamp_(-1, 1)
    if self.use_ICM:
        self.icm.optimizer.step()
    self.optimizer.step()
    return loss.item()
def add_transition(rep_buffer, ns_state, ns_action, ns_rew, ns_nexts, ns_done, current_state, empty_deque=False, ns=10, ns_gamma=0.99, is_done=True): ns_rew_sum = 0. trans = {} if empty_deque: # emptying the deques while len(ns_rew) > 0: for j in range(len(ns_rew)): ns_rew_sum += ns_rew[j] * ns_gamma**j # state,action,reward, # next_state,done, n_step_rew_sum, n_steps later # don't use done value because at this point the episode is done # trans['sample'] = [ns_state.popleft(), ns_action.popleft(), ns_rew.pop(0), # ns_nexts.popleft(), is_done, ns_rew_sum, current_state] trans = Transition(ns_state.popleft(), ns_action.popleft(), ns_nexts.popleft(), ns_rew.pop(0), ns_rew_sum) rep_buffer.add_sample(trans) else: for j in range(ns): ns_rew_sum += ns_rew[j] * ns_gamma**j # state,action,reward, # next_state,done, n_step_rew_sum, n_steps later # trans['sample'] = [ns_state.popleft(), ns_action.popleft(), ns_rew.pop(0), # ns_nexts.popleft(), ns_done.popleft(), ns_rew_sum, current_state] trans = Transition(ns_state.popleft(), ns_action.popleft(), ns_nexts.popleft(), ns_rew.pop(0), ns_rew_sum) rep_buffer.add_sample(trans)
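# Both branches of add_transition above accumulate the same truncated n-step
# return, sum_j gamma**j * r_j over the buffered reward window. A hypothetical
# helper expressing that sum directly (illustration only, not part of the
# original code):
def n_step_return(rewards, gamma):
    """Discounted sum of a reward window: r_0 + gamma*r_1 + gamma**2*r_2 + ..."""
    return sum(r * gamma ** j for j, r in enumerate(rewards))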
def add_experience(self, state, action, reward, new_state, final):
    """
    Add a SARS' tuple to the experience replay.
    :param state: source state
    :param action: action index
    :param reward: reward associated to the transition
    :param new_state: destination state
    :param final: whether the destination state is absorbing
    """
    # Remove older transitions if the replay memory is full
    if len(self.experiences) >= self.replay_memory_size:
        self.experiences.pop(0)
    # Add a tuple (state, action, reward, new_state, final) to replay memory
    experience = Transition(state, action, reward, new_state, final)
    # print(f'add_experience: added {experience}')
    self.experiences.append(experience)
def add_transition(self, action, next_state, reward, done): if not done and self.index < self.nsteps: next_state = self.processor._observation(next_state) self.transitions.insert(0, Transition(self.state, self.add_noop(action), next_state, torch.FloatTensor([reward]), torch.zeros(1))) transitions = [] gamma = 1 for trans in self.transitions: transitions.append(trans._replace(n_reward= trans.n_reward + gamma * reward)) gamma = gamma * GAMMA self.transitions = transitions else: for trans in self.transitions: self.memory.push(trans) self.transitions = [] self.state = next_state
def update_policy_net(self) -> None: """Update policy_net via Q-learning approximation""" # check if memory has enough elements to sample if len(self.memory) < self.batch_size: return # get transitions transitions = self.memory.sample(self.batch_size) batch = Transition(*zip(*transitions)) # get elements from batch non_final_mask = 1 - torch.tensor(batch.done, dtype=torch.uint8).to( torch.device(device)) non_final_mask = non_final_mask.type(torch.bool) non_final_next_obs = torch.stack([ ob for nonfinal, ob in zip(non_final_mask, batch.next_ob) if nonfinal ]).to(torch.device(device)) ob_batch = torch.stack(batch.ob).to(torch.device(device)) rew_batch = torch.stack(batch.rew).to(torch.device(device)) action_batch = torch.stack(batch.action).to(torch.device(device)) # estimate Q(st, a) with the policy network state_action_values = (self.policy_net.forward(ob_batch).gather( 1, action_batch).squeeze()) # estimate V(st+1) with target network next_state_values = torch.zeros(self.batch_size).to( torch.device(device)) next_state_values[non_final_mask] = ( self.target_net.forward(non_final_next_obs).max(1)[0].detach()) # expected Q value expected_state_action_values = (rew_batch.squeeze() + self.gamma * next_state_values) # loss loss = F.smooth_l1_loss(state_action_values, expected_state_action_values) # optimize the network self.optimizer.zero_grad() loss.backward() for param in self.policy_net.parameters(): param.grad.data.clamp_(-0.1, 0.1) self.optimizer.step()
def update(self, state, action, reward, next_state, terminal): self._episode_transitions.append( Transition(state, action, reward, next_state, terminal)) # Loop through the episode. # Compute discounted return from each state until the episode termination. # Use this computation to update both the actor and baseline. if terminal: discounted_return = 0 for transition in reversed(self._episode_transitions): discounted_return = self.DISCOUNT * discounted_return + transition.reward baseline = self._get_baseline(transition.state) td_error = discounted_return - baseline self._update_actor(transition.state, transition.action, td_error) self._update_baseline(transition.state, discounted_return)
def update(self, batch_size=16): if len(self.memory.memory) < batch_size: batch_size = len(self.memory.memory) transitions = self.memory.sample(batch_size) batch = Transition(*zip(*transitions)) state_batch = Variable(torch.cat(batch.state)) action_batch = Variable(torch.cat(batch.action)) reward_batch = Variable(torch.cat(batch.reward)) non_final_mask = ByteTensor( tuple(map(lambda s: s is not None, batch.next_state))) non_final_next_states = Variable(torch.cat( [s for s in batch.next_state if s is not None]), volatile=True) state_action_values = self.policy_net(state_batch).gather( 1, action_batch) next_state_values = Variable(torch.zeros(batch_size).type(Tensor)) next_state_values[non_final_mask] = self.target_net( non_final_next_states).max(1)[0] expected_state_action_values = (next_state_values * self.gamma) + reward_batch expected_state_action_values = Variable( expected_state_action_values.data) loss = F.mse_loss(state_action_values, expected_state_action_values) old_params = freeze_as_np_dict(self.policy_net.state_dict()) self.optimizer.zero_grad() loss.backward() for param in self.policy_net.parameters(): logging.debug(param.grad.data.sum()) param.grad.data.clamp_(-1., 1.) self.optimizer.step() new_params = freeze_as_np_dict(self.policy_net.state_dict()) check_params_changed(old_params, new_params) return loss.data[0]
def _do_network_update(self): if len(self.memory) < self.batch_size: return transitions = self.memory.sample(self.batch_size) batch = Transition(*zip(*transitions)) non_final_mask = ~torch.tensor(batch.done, dtype=torch.bool) non_final_next_states = [ s for nonfinal, s in zip(non_final_mask, batch.next_state) if nonfinal > 0 ] non_final_next_states = torch.stack(non_final_next_states) state_batch = torch.stack(batch.state) action_batch = torch.cat(batch.action) reward_batch = torch.cat(batch.reward) # Compute Q(s_t, a) - the model computes Q(s_t), then we select the # columns of actions taken. These are the actions which would've been taken # for each batch state according to policy_net state_action_values = self.get_state_act_vals(state_batch, action_batch) # Compute V(s_{t+1}) for all next states. # Expected values of actions for non_final_next_states are computed based # on the "older" target_net; selecting their best reward with max(1)[0]. # This is merged based on the mask, such that we'll have either the expected # state value or 0 in case the state was final. state_action_values = state_action_values.view(-1, 1).repeat( 1, len(self.q_models)) next_state_values = self.get_max_next_state_vals( non_final_mask, non_final_next_states) expected_state_action_values = next_state_values + reward_batch.view( -1, 1).repeat(1, len(self.q_models)) loss = (state_action_values - expected_state_action_values)**2 coefs = self.get_hyperbolic_train_coeffs(self.k, len(self.q_models)) loss = torch.sum(loss * coefs) # loss = F.smooth_l1_loss(state_action_values.squeeze(), # expected_state_action_values) # Optimize the model self.optimizer.zero_grad() loss.backward() self.optimizer.step()
def prime_buffer(self, env): """ Fill the n-step buffer each time the environment has been reset. """ # Maybe something is in there, clear it out. self.nstep_buffer = [] for step in range(self.config['n_steps']): action = self.online_network(self.state_transformer(self.state)) action = self.action_transformer(action) next_state, reward, done, info = env.step(action) trans = Transition(state=self.state, action=action, reward=reward, next_state=next_state, done=done, discounted_reward=0., nth_state=None, n=None) self.nstep_buffer.append(trans) self.state = next_state
def train(self): batch = self.memory.sample(min(BATCH_SIZE, len(self.memory))) b_dict = [torch.stack(elem) for elem in Transition(*zip(*batch))] states, actions, rewards, next_states, dones = \ b_dict[0], b_dict[1].view(-1, 1), \ b_dict[2].view(-1, 1).float().to(device), b_dict[3], \ b_dict[4].view(-1, 1).float().to(device) # CRITIC LOSS: Q(s, a) += (r + gamma*Q'(s, π'(s)) - Q(s, a)) # inputs computation inputs_critic = self.qnet(states, actions) # targets with torch.no_grad(): policy_acts = self.policy_targ(next_states) targ_values = self.qnet_targ(next_states, policy_acts) targets_critics = rewards + GAMMA * (1 - dones) * targ_values loss_critic = self.MSE_loss(inputs_critic, targets_critics) self.q_optimizer.zero_grad() loss_critic.backward() # nn.utils.clip_grad_norm_(self.qnet.parameters(), GRAD_CLIP) self.q_optimizer.step() # ACTOR objective: derivative of Q(s, π(s | ø)) with respect to ø actor_loss = -self.qnet(states, self.policy(states)).mean() self.p_optimizer.zero_grad() actor_loss.backward() # nn.utils.clip_grad_norm_(self.policy.parameters(), GRAD_CLIP) self.p_optimizer.step() soft_update(self.policy_targ, self.policy, TAU) soft_update(self.qnet_targ, self.qnet, TAU) if self.args.use_writer: self.writer.add_scalar("critic_loss", loss_critic.item(), self.n_updates) self.writer.add_scalar("actor_loss", actor_loss.item(), self.n_updates) self.n_updates += 1
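# train() above relies on soft_update(target, source, TAU) to Polyak-average
# the target networks. The helper is not shown in this file; a minimal sketch
# assuming that signature:
def soft_update(target, source, tau):
    """Parameter-wise update: target <- tau * source + (1 - tau) * target."""
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)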
def train_agent_model_free(agent, env, params): update_timestep = params['update_every_n_steps'] seed = params['seed'] log_interval = 1000 gif_interval = 500000 n_random_actions = params['n_random_actions'] n_evals = params['n_evals'] n_collect_steps = params['n_collect_steps'] use_statefilter = params['obs_filter'] save_model = params['save_model'] assert n_collect_steps > agent.batchsize, "We must initially collect as many steps as the batch size!" avg_length = 0 time_step = 0 cumulative_timestep = 0 cumulative_log_timestep = 0 n_updates = 0 i_episode = 0 log_episode = 0 samples_number = 0 episode_rewards = [] episode_steps = [] if use_statefilter: state_filter = MeanStdevFilter(env.env.observation_space.shape[0]) else: state_filter = None random.seed(seed) torch.manual_seed(seed) np.random.seed(seed) env.seed(seed) env.action_space.np_random.seed(seed) max_steps = env.spec.max_episode_steps writer = SummaryWriter() while samples_number < 3e7: time_step = 0 episode_reward = 0 i_episode += 1 log_episode += 1 state = env.reset() if state_filter: state_filter.update(state) done = False while (not done): cumulative_log_timestep += 1 cumulative_timestep += 1 time_step += 1 samples_number += 1 if samples_number < n_random_actions: action = env.action_space.sample() else: action = agent.get_action(state, state_filter=state_filter) nextstate, reward, done, _ = env.step(action) # if we hit the time-limit, it's not a 'real' done; we don't want to assign low value to those states real_done = False if time_step == max_steps else done agent.replay_pool.push( Transition(state, action, reward, nextstate, real_done)) state = nextstate if state_filter: state_filter.update(state) episode_reward += reward # update if it's time if cumulative_timestep % update_timestep == 0 and cumulative_timestep > n_collect_steps: q1_loss, q2_loss, pi_loss, a_loss = agent.optimize( update_timestep, state_filter=state_filter) n_updates += 1 # logging if cumulative_timestep % log_interval == 0 and cumulative_timestep > n_collect_steps: writer.add_scalar('Loss/Q-func_1', q1_loss, n_updates) writer.add_scalar('Loss/Q-func_2', q2_loss, n_updates) writer.add_scalar('Loss/policy', pi_loss, n_updates) writer.add_scalar('Loss/alpha', a_loss, n_updates) writer.add_scalar('Values/alpha', np.exp(agent.log_alpha.item()), n_updates) avg_length = np.mean(episode_steps) running_reward = np.mean(episode_rewards) eval_reward = evaluate_agent(env, agent, state_filter, n_starts=n_evals) writer.add_scalar('Reward/Train', running_reward, cumulative_timestep) writer.add_scalar('Reward/Test', eval_reward, cumulative_timestep) print( 'Episode {} \t Samples {} \t Avg length: {} \t Test reward: {} \t Train reward: {} \t Number of Policy Updates: {}' .format(i_episode, samples_number, avg_length, eval_reward, running_reward, n_updates)) episode_steps = [] episode_rewards = [] if cumulative_timestep % gif_interval == 0: make_gif(agent, env, cumulative_timestep, state_filter) if save_model: make_checkpoint(agent, cumulative_timestep, params['env']) episode_steps.append(time_step) episode_rewards.append(episode_reward)
empty_color = [] empty_depth = [] for i in range(m1.length): M1.add(m1.tree.data[i]) M2.add(m2.tree.data[i]) M3.add(m3.tree.data[i]) for i in range(m1.length): # Invalid point is common if m1.tree.data[i].reward == -3 * R: transition = m1.tree.data[i] pixel_index = transition.pixel_idx pixel_index[0] = 1 transition_2 = Transition(transition.color, transition.depth, pixel_index, transition.reward, transition.next_color, transition.next_depth, transition.is_empty) M2.add(transition_2) pixel_index[0] = np.random.choice(range(2, 6)) transition_3 = Transition(transition.color, transition.depth, pixel_index, transition.reward, transition.next_color, transition.next_depth, transition.is_empty) M3.add(transition_3) if m2.tree.data[i].reward == -3 * R: transition = m2.tree.data[i] pixel_index = transition.pixel_idx pixel_index[0] = 0 transition_1 = Transition(transition.color, transition.depth, pixel_index, transition.reward, transition.next_color, transition.next_depth,
def optimize_model(policy_net, target_net, replay_memory, optimizer, scheduler): if len(replay_memory) < config.BATCH_SIZE: return # print('Training...') policy_net.train() # print('Model mode:',policy_net.training) transitions = replay_memory.sample(config.BATCH_SIZE) # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for # detailed explanation). This converts batch-array of Transitions # to Transition of batch-arrays. batch = Transition(*zip(*transitions)) # Compute a mask of non-final states and concatenate the batch elements # (a final state would've been the one after which simulation ended) non_final_mask = torch.tensor(tuple( map(lambda s: s is not None, batch.next_state)), device=device, dtype=torch.uint8) try: non_final_next_states = torch.stack( [s for s in batch.next_state if s is not None]) except: non_final_next_states = None state_batch = torch.stack(batch.state) action_batch = torch.stack(batch.action) reward_batch = torch.stack(batch.reward) # Compute Q(s_t, a) - the model computes Q(s_t), then we select the # columns of actions taken. These are the actions which would've been taken # for each batch state according to policy_net state_action_values = policy_net(state_batch).gather(1, action_batch) # Compute V(s_{t+1}) for all next states. # Expected values of actions for non_final_next_states are computed based # on the "older" target_net; selecting their best reward with max(1)[0]. # This is merged based on the mask, such that we'll have either the expected # state value or 0 in case the state was final. next_state_values = torch.zeros(config.BATCH_SIZE, device=device) if non_final_next_states is not None: next_state_values[non_final_mask] = target_net( non_final_next_states).max(1)[0].detach() # next_state_action = policy_net(non_final_next_states).max(1)[1].view(-1,1).detach() # next_state_values[non_final_mask] = target_net(non_final_next_states).gather(1, next_state_action) next_state_values = next_state_values.view(config.BATCH_SIZE, 1).float() # Compute the expected Q values expected_state_action_values = (next_state_values * config.GAMMA) + reward_batch.float() # Compute Huber loss loss = F.smooth_l1_loss(state_action_values, expected_state_action_values) # Optimize the model # for param_group in optimizer.param_groups: # print(param_group['lr']) optimizer.zero_grad() loss.backward() for param in policy_net.parameters(): param.grad.data.clamp_(-1, 1) optimizer.step() scheduler.step() policy_net.eval() # print('Model mode:',policy_net.training) return
def main(): # Parse input parser = argparse.ArgumentParser(prog="exp_1", description="Code for Exp 1., testing the model capacity for grasping") parser.add_argument("model", type=str, help="model path for testing") parser.add_argument("type", type=str, help="novel/hybrid") parser.add_argument("run", type=int, help="Which number is this run") parser.add_argument("episode", type=int, help="Which episode is this run") parser.add_argument("--obj_nums", type=int, default=8, help="Number of object, default is 6") parser.add_argument("--port", type=str, default="/dev/ttylight", help="Port for arduino, which controls the alram lamp, default is /dev/ttylight") parser.add_argument("--densenet_lr", type=float, default=1e-5, help="Learning rate for feature extraction part, default is 1e-5") parser.add_argument("--primitive_lr", type=float, default=5e-5, help="Learning rate for motion primitive subnetworks, default is 1e-4") args = parser.parse_args() utils.show_args(args) # Create directories r = rospkg.RosPack() package_path = r.get_path("grasp_suck") root_path, image_path, depth_path, pc_path, vis_path, grasp_path, mixed_paths, feat_paths = create_directories(package_path, args.episode, args.run, args.type) arduino = serial.Serial(args.port, 115200) reward = 5.0 discount_factor = 0.5 return_ = 0.0 pick_item = 0 # Service clients vacuum_pump_control = rospy.ServiceProxy("/vacuum_pump_control_node/vacuum_control", SetBool) check_suck_success = rospy.ServiceProxy("/vacuum_pump_control_node/check_suck_success", SetBool) go_home = rospy.ServiceProxy("/agent_server_node/go_home", Empty) go_place = rospy.ServiceProxy("/agent_server_node/go_place", Empty) fixed_home = rospy.ServiceProxy("/agent_server_node/go_home_fix_orientation", Empty) publish_data_client = rospy.ServiceProxy("/agent_server_node/publish_data", publish_info) record_bag_client = rospy.ServiceProxy("/autonomous_recording_node/start_recording", recorder) stop_record_client = rospy.ServiceProxy("/autonomous_recording_node/stop_recording", Empty) # Shared data between processes work = mp.Value(c_bool, True) # Can prediction thread continue working? <bool> ready = mp.Value(c_bool, False) # Is prediction thread ready? <bool> can_predict = mp.Value(c_bool, False) # Can prediction thread do predict? <bool> should_reset = mp.Value(c_bool, False) # Should prediction thread reset model? <bool> iteration = mp.Value("i", 0) # What iteration is this action? 
<int> path_queue = mp.Queue() path_queue.put([image_path, depth_path, pc_path, vis_path, feat_paths, mixed_paths]) action_queue = mp.Queue() # Action placeholder, prediction thread will generate an action and main thread will consume it experience_queue = mp.Queue() # Transition placeholder, main thread will generate a transition and prediction thread will consume it # Start prediction thread p = mp.Process(target=prediction_process, args=(args, \ action_queue, experience_queue, \ work, ready, can_predict, should_reset, \ iteration, \ path_queue, )) p.start() # Initialize while not ready.value: pass go_home() vacuum_pump_control(SetBoolRequest(False)) is_empty = False cmd = raw_input("[Main Thread] Press any key to continue...") program_ts = time.time() can_predict.value = True while 1: action_target = [] is_empty_list = [] print("Code: {}".format(encode_index(args.episode, args.run))) record_bag_client(recorderRequest(encode_index(args.episode, args.run))) # Start recording while not is_empty and iteration.value<args.obj_nums*2: print("\033[1;32m[{}] Iteration: {}\033[0m".format(time.time()-program_ts, iteration.value)) arduino.write("b 1000") # Wait until there is action in the queue while action_queue.empty(): pass action_obj = action_queue.get() # [action, action_str, points, angle, pixel_index] is_valid = utils.check_if_valid(action_obj[2]) _viz(action_obj[2], action_obj[0], action_obj[3], is_valid) will_collide = None if is_valid: tool_id = (3-action_obj[0]) if action_obj[0] <2 else 1 if tool_id == 1: will_collide = _check_collide(action_obj[2], action_obj[3]) if not will_collide or tool_id!=1: _take_action(tool_id, action_obj[2], action_obj[3]) else: print("[Main Thread] Will collide, abort request!") else: arduino.write("r 1000") action_success = False if is_valid: if action_obj[0] < 2: action_success = check_suck_success().success else: if not will_collide: action_success = _check_grasp_success(iteration.value, grasp_path) else: action_success = False if action_success: pick_item += 1 info = publish_infoRequest(); info.execution = utils.wrap_execution_info(iteration.value, is_valid, action_obj[0], action_success); publish_data_client(info) empty_state = mp.Value(c_bool, False) iteration.value += 1 next_state_thread = mp.Process(target=get_next_state, args=(empty_state, iteration.value-1, (pick_item==args.obj_nums), pc_path, image_path, depth_path)) next_state_thread.start() if action_success: arduino.write("g 1000"); go_place(); fixed_home(); else: fixed_home(); vacuum_pump_control(SetBoolRequest(False)); current_reward = utils.reward_judgement(reward, is_valid, action_success) return_ += current_reward * np.power(discount_factor, iteration.value-1) print "\033[1;33mCurrent reward: {} \t Return: {}\033[0m".format(current_reward, return_) color_name, depth_name, next_color_name, next_depth_name = utils.wrap_strings(image_path, depth_path, iteration.value-1) next_state_thread.join(); is_empty = empty_state.value action_target.append(action_obj[4]); is_empty_list.append(is_empty) transition = Transition(color_name, depth_name, action_obj[4], current_reward, next_color_name, next_depth_name, is_empty) experience_queue.put(transition) if not is_empty and iteration.value < args.obj_nums*2: can_predict.value = True stop_record_client() if is_empty: print("\033[1;33m[{}] Pass test with return: {}\033[0m".format(time.time()-program_ts, return_)) else: print("\033[1;31m[{}] Failed with return: {}\033[0m".format(time.time()-program_ts, return_)) np.savetxt(root_path+"action_target.csv", 
action_target, delimiter=",") np.savetxt(root_path+"is_empty.csv", is_empty_list, delimiter=",") f = open(root_path+"{}.txt".format(encode_index(args.episode, args.run)), 'w') f.write("{}\n".format(is_empty)); f.write("{}".format(return_)); f.close() action_target = []; is_empty_list = [] cmd = raw_input("Press 'r' to reset, 'e' to exit: ") if cmd == 'e' or cmd == 'E': break elif cmd == 'r' or cmd == 'R': print("[Main Thread] Receive reset command") ready.value = False should_reset.value = True args.run += 1 root_path, image_path, depth_path, pc_path, vis_path, grasp_path, mixed_paths, feat_paths = create_directories(package_path, args.episode, args.run, args.type) path_queue.put([image_path, depth_path, pc_path, vis_path, feat_paths, mixed_paths]) is_empty = False pick_item = 0 return_ = 0.0 iteration.value = 0 # Wait until prediction thread ready while not ready.value: pass program_ts = time.time() can_predict.value = True # Tell prediction thread we can start # Stop prediction thread work.value = False p.join() print("Main thread stop")
def optimize_model(): if len(memory) < BATCH_SIZE: return transitions = memory.sample(BATCH_SIZE) # print("trasitions:", transitions) # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for # detailed explanation). This converts batch-array of Transitions # to Transition of batch-arrays. # Batch is a name tuple, each field contains a list of batch size states. batch = Transition(*zip(*transitions)) # print("batch", batch) # Compute a mask of non-final states and concatenate the batch elements # (a final state would've been the one after which simulation ended) non_final_mask = torch.tensor(tuple( map(lambda s: s is not None, batch.next_state)), device=device, dtype=torch.bool).to(device) #print("non_final_mask:",non_final_mask) non_final_next_states = torch.cat( [s for s in batch.next_state if s is not None]).to(device) #print("non_final_next_states", non_final_next_states) #print("non_final_next_states shape", non_final_next_states.shape) # (batch_size, state_h, state_w) state_batch = torch.cat(batch.state).to(device) action_batch = torch.cat(batch.action).to(device) reward_batch = torch.cat(batch.reward).to(device) #print("reward batch:", reward_batch.shape) # Compute Q(s_t, a) - the model computes Q(s_t), then we select the # columns of actions taken. These are the actions which would've been taken # for each batch state according to policy_net #print("state batch shape:",state_batch.shape) #print("action_batch:", action_batch) #print("unsqueeze:",action_batch.unsqueeze(1)) state_action_values = policy_net(state_batch).gather( 1, action_batch.unsqueeze(1)) #print("state action values:",state_action_values) # Compute V(s_{t+1}) for all next states. # Expected values of actions for non_final_next_states are computed based # on the "older" target_net; selecting their best reward with max(1)[0]. # This is merged based on the mask, such that we'll have either the expected # state value or 0 in case the state was final. next_state_values = torch.zeros(BATCH_SIZE, device=device) next_state_values[non_final_mask] = target_net(non_final_next_states).max( 1)[0].detach() #print("next_state_values:", next_state_values.shape) # Compute the expected Q values expected_state_action_values = (next_state_values.view(BATCH_SIZE, 1) * GAMMA) + reward_batch #print("state action values size:",state_action_values.shape) #print("expected state action values:",expected_state_action_values) #print("expected state action values size:", expected_state_action_values.unsqueeze(1)[:,:,0].shape) # Compute Huber loss loss = F.smooth_l1_loss( state_action_values.view(BATCH_SIZE, 1), expected_state_action_values.unsqueeze(1).view(BATCH_SIZE, 1).float()) #print("loss",loss) #input('press to continue') # Optimize the model optimizer.zero_grad() loss.backward() # for param in policy_net.parameters(): # param.grad.data.clamp_(-1, 1) optimizer.step()
def train(self): """ Train the online network using the n-step loss. """ env = self.env_builder() self.state = env.reset() self.prime_buffer(env) self.step = 0 score = 0 for epoch in range(self.config['n_epochs']): epoch_loss = [] start_time = time.time() for batch in range(self.config['n_batches_per_epoch']): scores = self.evaluator.evaluate(self.step) if scores is not None and self.batchsize_bandit is not None: if self.step > 0: reward = np.median(scores) self.batchsize_bandit.step(reward) batch_size, expert_batch_size = self.batchsize_bandit.sample( ) self.config['batch_size'] = batch_size self.config['expert_batch_size'] = expert_batch_size wandb.log({ "bandit_batch_size": batch_size, "bandit_expert_batch_size": expert_batch_size, "bandit_values": self.batchsize_bandit.values }) # Choose an action based on the current state and # according to an epsilon-greedy policy. epsilon = self.epsilon_schedule.value(self.step) if random.random() < epsilon: action = env.action_space.sample() else: action = self.action_transformer( self.online_network(self.state_transformer( self.state))) # Update the current state of the environment by taking # the action and building the current transition to be # added to the n-step buffer. These states are only added # to the replay buffer after a delay of n-steps. next_state, reward, done, info = env.step(action) current_trans = Transition(state=self.state, action=action, next_state=next_state, reward=reward, discounted_reward=None, nth_state=None, done=done, n=None) # Now use the contents of the n-step buffer to construct # the delayed transition and add that to the prioritized # replay buffer to be sampled for learning. (delayed_states, delayed_actions, delayed_rewards, delayed_next_states, delayed_discounted_rewards, delayed_nth_states, delayed_dones, delayed_ns) = expand_transitions(self.nstep_buffer, torchify=False) # Ensure that if the current episode has ended the last # few transitions get added correctly to the buffer. if not current_trans.done: delayed_trans = Transition( state=delayed_states[0], action=delayed_actions[0], reward=delayed_rewards[0], next_state=delayed_next_states[0], discounted_reward=np.sum([ reward * self.config['gamma']**i for i, reward in enumerate(delayed_rewards) ]), nth_state=self.state, done=done, n=self.config['n_steps']) self.buffer.add(delayed_trans) else: for i in range(self.config['n_steps']): delayed_trans = Transition( state=delayed_states[i], action=delayed_actions[i], reward=delayed_rewards[i], next_state=delayed_next_states[i], discounted_reward=np.sum([ reward * self.config['gamma']**j for j, reward in enumerate(delayed_rewards[i:]) ]), nth_state=self.state, done=done, n=self.config['n_steps'] - i) self.buffer.add(delayed_trans) # Now that we have used the buffer, we can add the current # transition to the queue. Update the current state of the # environment. self.nstep_buffer.append(current_trans) if len(self.nstep_buffer) > self.config['n_steps']: _ = self.nstep_buffer.pop(0) self.state = next_state beta = self.beta_schedule.value(self.step) if len(self.buffer) >= self.config[ 'batch_size'] and self.config['batch_size'] > 0: # Sample a batch of experience from the replay buffer and # train with the n-step TD loss. transitions, weights, indices = self.buffer.sample( self.config['batch_size'], beta) (states, actions, rewards, next_states, discounted_rewards, nth_states, dones, ns) = expand_transitions( transitions, torchify=True, state_transformer=self.state_transformer) # Calculate the loss per transition. 
This is not # aggregated so that we can make the importance sampling # correction to the loss. # # First we calculate the loss for 1-step ahead, then if # required, we look ahead n-steps and add that to our loss. # Importance sampling weights are based on the 1-step loss. loss = ntd_loss(online_model=self.online_network, target_model=self.target_network, states=states, actions=actions, next_states=next_states, rewards=rewards, dones=dones, gamma=0.99, n=1) weights = torch.FloatTensor(weights).to(self.device) loss = loss * weights priorities = loss + 1e-5 priorities = priorities.detach().cpu().numpy() self.buffer.update_priorities(priorities, indices) loss = loss.mean() if self.config['n_steps'] > 1: nstep_loss = ntd_loss(online_model=self.online_network, target_model=self.target_network, states=states, actions=actions, next_states=nth_states, rewards=discounted_rewards, dones=dones, gamma=0.99, n=ns) nstep_loss = nstep_loss.mean() loss += nstep_loss # Maybe we have an expert buffer, if so we should train some # samples from that expert buffer. if self.expert_buffer is not None and self.config[ 'expert_batch_size'] > 0: e_transitions, e_weights, e_indices = self.expert_buffer.sample( self.config['expert_batch_size'], beta) (e_states, e_actions, e_rewards, e_next_states, e_discounted_rewards, e_nth_states, e_dones, e_ns) = expand_transitions( e_transitions, torchify=True, state_transformer=self.state_transformer) e_loss = ntd_loss(online_model=self.online_network, target_model=self.target_network, states=e_states, actions=e_actions, next_states=e_next_states, rewards=e_rewards, dones=e_dones, gamma=0.99, n=1) e_weights = torch.FloatTensor(e_weights).to(self.device) e_loss = e_loss * e_weights e_priorities = e_loss + 1e-5 e_priorities = e_priorities.detach().cpu().numpy() self.expert_buffer.update_priorities( e_priorities, e_indices) e_loss = e_loss.mean() if self.config['n_steps'] > 1: e_nstep_loss = ntd_loss( online_model=self.online_network, target_model=self.target_network, states=e_states, actions=e_actions, next_states=e_nth_states, rewards=e_discounted_rewards, dones=e_dones, gamma=0.99, n=e_ns) e_nstep_loss = e_nstep_loss.mean() e_loss += e_nstep_loss q_values = self.online_network(e_states) e_loss += torch.mean(margin_loss(q_values, e_actions)) # Finally, add this sucker to the loss if we do have expert samples. if len(self.buffer) > self.config["batch_size"]: if self.config['batch_size'] > 0 and self.config[ 'expert_batch_size'] > 0: loss = loss * self.online_coef + e_loss * self.expert_coef elif self.config['batch_size'] > 0: loss = loss elif self.config['expert_batch_size'] > 0: loss = e_loss # Take the step of updating online network parameters # based on this batch loss. self.optimizer.zero_grad() loss.backward() self.optimizer.step() # End of training step actions epoch_loss.append(loss.detach().cpu().numpy()) # End of every step actions if self.step % self.config['update_interval'] == 0: self.target_network.load_state_dict( self.online_network.state_dict()) self.step += 1 score += current_trans.reward if current_trans.done: if score > self.best_episode: self.best_episode = score self.episodic_reward.append(score) score = 0 self.state = env.reset() self.prime_buffer(env) wandb.log({"episodic_reward": self.episodic_reward[-1]}) # End of batch actions self.loss.append(np.mean(epoch_loss)) print("Epoch {0}, Score {1:6.4f}, Loss {2:6.4f}, Time {3:6.4f}". 
format(epoch, score, self.loss[-1], time.time() - start_time)) wandb.log({ "time": time.time() - start_time, "loss": self.loss[-1], "epsilon": epsilon, "beta": beta }) wandb.log({"best_episode": self.best_episode})
def add_transition(self, state, action, next_state, reward, done, priority): trans = Transition(state, action, next_state, reward, done) self._buffer.push(item=trans, priority=priority)