def explore(self, actor_critic):
    """
    Explore an environment by taking a sequence of actions and saving
    the results in the memory.

    Parameters
    ----------
    actor_critic : ActorCritic
        The actor-critic model to use to explore.
    """
    state = torch.FloatTensor(self.env.env.state)
    trajectory = []
    for step in range(MAX_STEPS_BEFORE_UPDATE):
        action_probabilities, *_ = actor_critic(Variable(state))
        action = action_probabilities.multinomial()
        action = action.data
        exploration_statistics = action_probabilities.data.view(1, -1)
        next_state, reward, done, _ = self.env.step(action.numpy()[0])
        next_state = torch.from_numpy(next_state).float()
        if self.render:
            self.env.render()
        transition = replay_memory.Transition(
            states=state.view(1, -1),
            actions=action.view(1, -1),
            rewards=torch.FloatTensor([[reward]]),
            next_states=next_state.view(1, -1),
            done=torch.FloatTensor([[done]]),
            exploration_statistics=exploration_statistics)
        self.buffer.add(transition)
        trajectory.append(transition)
        if done:
            self.env.reset()
            break
        else:
            state = next_state
    return trajectory
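# --- Illustrative sketch (assumption, not from the original source). ---
# `explore` above passes keyword fields into `replay_memory.Transition`; a
# container compatible with those calls is a plain namedtuple with exactly the
# fields used. Whether the original `replay_memory` module defines it this way
# is an assumption.
import collections

Transition = collections.namedtuple(
    'Transition',
    ['states', 'actions', 'rewards', 'next_states', 'done',
     'exploration_statistics'])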
def explore(self, actor_critic, noise_ratio=0.):
    """
    Explore an environment by taking a sequence of actions and saving
    the results in the memory.

    Parameters
    ----------
    actor_critic : ActorCritic
        The actor-critic model to use to explore.
    noise_ratio : float in [0, 1], optional
        What fraction of the action should be exploration noise?
    """
    # state = torch.FloatTensor(self.env.env.state)
    if self.m_state is None:
        self.m_state = self.env.reset()
    state = torch.FloatTensor(self.m_state)
    trajectory = []
    for step in range(MAX_STEPS_BEFORE_UPDATE):
        policy_mean, *_ = actor_critic(Variable(state))
        policy_logsd = actor_critic.policy_logsd
        # Sample an action from the Gaussian policy.
        action = torch.normal(policy_mean.data, torch.exp(policy_logsd.data))
        # Blend the policy sample with exploration noise.
        noise_mean, noise_sd = self.noise.sampling_parameters()
        noise = torch.from_numpy(self.noise.sample()).float()
        action = noise_ratio * noise + (1. - noise_ratio) * action
        # Mean and log-standard-deviation of the mixed sampling distribution,
        # stored as the exploration statistics of this transition.
        sampling_mean = noise_ratio * torch.from_numpy(noise_mean).float() \
            + (1. - noise_ratio) * policy_mean.data
        sampling_logsd = 0.5 * torch.log(
            noise_ratio**2 * torch.from_numpy(noise_sd).float().pow(2)
            + (1. - noise_ratio)**2 * torch.exp(2 * policy_logsd.data))
        exploration_statistics = torch.cat(
            [sampling_mean.view(1, -1), sampling_logsd.view(1, -1)], dim=1)
        # Squash the unbounded action into the environment's action range.
        scaled_action = float(self.env.action_space.low[0]) \
            + float(self.env.action_space.high[0]
                    - self.env.action_space.low[0]) * torch.sigmoid(action)
        next_state, reward, done, _ = self.env.step(scaled_action.numpy())
        next_state = torch.from_numpy(next_state).float()
        if self.render:
            self.env.render()
        transition = replay_memory.Transition(
            states=state.view(1, -1),
            actions=action.view(1, -1),
            rewards=torch.FloatTensor([[reward]]),
            next_states=next_state.view(1, -1),
            done=torch.FloatTensor([[done]]),
            exploration_statistics=exploration_statistics)
        self.buffer.add(transition)
        trajectory.append(transition)
        if done:
            self.m_state = self.env.reset()
            break
        else:
            state = next_state
    return trajectory
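# --- Illustrative sketch (assumption, not from the original source). ---
# The noisy `explore` above only requires `self.noise` to expose `sample()`
# and `sampling_parameters()`. A minimal Gaussian noise process with that
# interface could look like this; the class name and defaults are hypothetical.
import numpy as np


class GaussianNoise:
    def __init__(self, action_dim, sd=0.2):
        self.mean = np.zeros(action_dim, dtype=np.float32)
        self.sd = sd * np.ones(action_dim, dtype=np.float32)

    def sample(self):
        # One draw from N(mean, sd^2) with the same shape as the action.
        return np.random.normal(self.mean, self.sd).astype(np.float32)

    def sampling_parameters(self):
        # Mean and standard-deviation arrays, as consumed by `explore` when it
        # builds the statistics of the mixed sampling distribution.
        return self.mean, self.sd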
def optimize_model(self):
    if len(self.memory) < self.batch_size:
        return
    transitions = self.memory.sample(self.batch_size)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for a
    # detailed explanation). This converts a batch-array of Transitions
    # to a Transition of batch-arrays.
    batch = replay_memory.Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended).
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                            batch.next_state)),
                                  device=self.device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                       if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been
    # taken for each batch state according to policy_net.
    state_action_values = self.policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1)[0].
    # This is merged based on the mask, such that we'll have either the
    # expected state value or 0 in case the state was final.
    next_state_values = torch.zeros(self.batch_size, device=self.device)
    next_state_values[non_final_mask] = self.target_net(
        non_final_next_states).max(1)[0].detach()
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * self.gamma) \
        + reward_batch

    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values,
                            expected_state_action_values.unsqueeze(1))

    # Optimize the model
    self.optimizer.zero_grad()
    loss.backward()
    for param in self.policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    self.optimizer.step()
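# --- Illustrative sketch (assumption, not from the original source). ---
# `optimize_model` above bootstraps its targets from `self.target_net`, which
# has to be synchronised with `self.policy_net` elsewhere. A typical hard
# update, written here as a hypothetical helper method of the same agent class:
def update_target_network(self):
    # Copy the online network's weights into the frozen target network.
    self.target_net.load_state_dict(self.policy_net.state_dict())
    self.target_net.eval()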
def optimize_model(self):
    if self.test_mode:
        print("Testing Mode")
        return
    if len(self.memory) < self.batch_size:
        print("Skipping:", len(self.memory))
        return
    transitions = self.memory.sample(self.batch_size)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for a
    # detailed explanation). This converts a batch-array of Transitions
    # to a Transition of batch-arrays.
    batch = replay_memory.Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended).
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                            batch.next_state)),
                                  device=self.device, dtype=torch.bool)
    non_final_next_states = torch.cat(
        [s.view(1, -1) for s in batch.next_state if s is not None], dim=0)
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward).float()

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been
    # taken for each batch state according to policy_net.
    state_action_values = self.policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1)[0].
    # This is merged based on the mask, such that we'll have either the
    # expected state value or 0 in case the state was final.
    next_state_values = torch.zeros(self.batch_size, device=self.device)
    next_state_values[non_final_mask] = self.target_net(
        non_final_next_states).view(-1, self.n_actions).max(1)[0].detach()
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * self.gamma) \
        + reward_batch

    # Compute MSE loss
    loss = F.mse_loss(state_action_values,
                      expected_state_action_values.unsqueeze(1))

    # Periodically log the loss and mean reward and checkpoint the model.
    if self.loss_count == 0:
        # Store a detached scalar so the computation graph is not kept alive.
        self.loss_graph.append(loss.item())
        if len(self.rewards_cache) > 10:
            self.rewards_graph.append(np.mean(np.array(self.rewards_cache)))
            self.rewards_cache = []
        self.saveGraph()
        self.saveModel('/home/alvin/Desktop/MRSD_ws/rl_model.pt')
    self.loss_count += 1
    if self.loss_count == 20:
        self.loss_count = 0
    if self.loss_count % self.target_update == 0:
        print("Update Target Network")
        self.updateTargetNetwork()

    # Optimize the model
    self.optimizer.zero_grad()
    loss.backward()
    for param in self.policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    print("Optimizing")
    self.optimizer.step()
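# --- Illustrative sketch (assumption, not from the original source). ---
# Possible implementations of the checkpoint helpers called above
# (`saveModel`, `saveGraph`), written as methods of the same agent class.
# The plotting details and output file name are hypothetical.
import matplotlib.pyplot as plt
import torch


def saveModel(self, path):
    # Persist only the online network's weights.
    torch.save(self.policy_net.state_dict(), path)


def saveGraph(self):
    # Dump the running loss and mean-reward curves to disk.
    plt.figure()
    plt.plot(self.loss_graph, label='loss')
    plt.plot(self.rewards_graph, label='mean reward')
    plt.legend()
    plt.savefig('training_curves.png')
    plt.close()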