import numpy as np
import torch
from torch.distributions import MultivariateNormal
from tqdm import tqdm
# Memory, eval_mode, Pipeline and one_hot_encoding are provided by the
# surrounding library; their import paths are not shown in this excerpt.


def __call__(self, agent, selection_strategy, memory):
    observation = agent.env.reset().detach()
    episode_reward = 0
    step_counter = 0
    terminate = False
    episode_memory = Memory(agent.train_loader.memory_cell_names,
                            gamma=memory.gamma)
    with eval_mode(agent):
        while not terminate:
            step_counter += 1
            agent.to(agent.device)
            action, certainty = selection_strategy(
                agent, observation.to(agent.device))
            new_observation, reward, terminate, _ = agent.env.step(action)
            # reward is averaged over the parallel environment instances
            episode_reward += torch.sum(reward).item() / agent.env.n_instances
            episode_memory.memorize(
                (action, observation, torch.tensor(reward).float(),
                 new_observation, terminate, certainty.detach()),
                ['action', 'state', 'reward', 'new_state', 'terminal',
                 'uncertainty'])
            # keep only the instances that have not terminated yet
            observation = new_observation[~terminate.view(-1)]
            terminate = terminate.min().item()
    memory.memorize(episode_memory, episode_memory.memory_cell_names)
    agent.train_dict['rewards'] = agent.train_dict.get('rewards', []) + [episode_reward]
    if episode_reward > agent.train_dict.get('best_performance', -np.inf):
        agent.train_dict['best_performance'] = episode_reward
    return episode_reward
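# For context, a minimal sketch of what a `selection_strategy` callable could
# look like. This greedy selector is an assumption, not part of the source;
# the only contract used above is that it returns an (action, certainty) pair.
class GreedySelection:
    def __call__(self, agent, observation):
        q_values = agent(observation)            # (n_instances, n_actions)
        probs = torch.softmax(q_values, dim=1)
        certainty, action = probs.max(dim=1)     # confidence of the argmax
        return action, certainty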
def play_episode(self):
    observation = self.env.reset().detach()
    episode_reward = 0
    step_counter = 0
    terminate = False
    episode_memory = Memory(
        ['action', 'state', 'reward', 'new_state', 'terminal'],
        gamma=self.gamma)
    with eval_mode(self):
        while not terminate:
            step_counter += 1
            with torch.no_grad():
                action = self.chose_action(self, observation)
            new_observation, reward, terminate, _ = self.env.step(action)
            episode_reward += reward
            episode_memory.memorize(
                (action, observation, torch.tensor(reward).float(),
                 new_observation, terminate),
                ['action', 'state', 'reward', 'new_state', 'terminal'])
            observation = new_observation
    self.train_loader.memorize(episode_memory,
                               episode_memory.memory_cell_names)
    self.train_dict['rewards'] = self.train_dict.get('rewards', []) + [episode_reward]
    if episode_reward > self.train_dict.get('best_performance', -np.inf):
        self.train_dict['best_performance'] = episode_reward
    return episode_reward
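# Usage sketch (an assumption, not from the source): a typical outer loop
# alternates environment interaction with learning on the replayed memory.
# `agent` stands for any object exposing the play_episode / fit_epoch API
# shown in this file.
def train(agent, n_epochs):
    for _ in range(n_epochs):
        agent.play_episode()           # fills agent.train_loader with transitions
        agent.fit_epoch(agent.device)  # learns from the stored transitions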
def forward(self, model):
    with eval_mode(model):
        model.to(model.device)
        loss = []
        accuracies = []
        for data, y in self.data_loader:
            data = data.to(model.device)
            y = y.to(model.device)
            y_pred = model.model(data)
            loss += [model.crit(y_pred, y)]
            y_pred = y_pred.max(dim=1)[1]
            accuracies += [(y_pred == y).float()]
        loss = torch.stack(loss).mean().item()
        model.train_dict['val_losses'] = model.train_dict.get('val_losses', []) + [loss]
        model.train_dict['val_epochs'] = model.train_dict.get('val_epochs', []) + [model.train_dict['epochs_run']]
        accuracy = torch.cat(accuracies).mean().item()
        model.train_dict['val_accuracy'] = model.train_dict.get('val_accuracy', []) + [accuracy]
        if loss < model.train_dict.get('best_val_performance', np.inf):
            # record the new best validation loss under the key it is compared against
            model.train_dict['best_val_performance'] = loss
            model.train_dict['epochs_since_last_val_improvement'] = 0
        if self.verbose == 1:
            print('val loss: {:.4f} - val accuracy: {:.4f}'.format(loss, accuracy))
    return loss
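# Hypothetical wiring (the callback class name below is invented for
# illustration): the object holding this forward() would be constructed with
# a held-out loader and called with the model after each training epoch.
validator = ValidationCallback(data_loader=val_loader, verbose=1)
val_loss = validator.forward(model)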
def forward(self, model):
    if model.train_dict['epochs_run'] % self.frequency == 0:
        print('Evaluating environment...', flush=True)
        with eval_mode(model):
            episode_rewards = []
            for _ in tqdm(range(self.n_evaluations)):
                terminate = False
                episode_reward = 0
                observation = self.env.reset().detach()
                while not terminate:
                    action = self.action_selector(model, observation)
                    new_observation, reward, terminate, _ = self.env.step(action)
                    episode_reward += torch.sum(
                        torch.tensor(reward)).item() / self.env.n_instances
                    observation = new_observation
                    terminate = terminate.min().item()
                episode_rewards += [episode_reward]
        print(f'Evaluation reward for {model.name}: '
              f'{np.mean(episode_rewards):.2f}', flush=True)
        for name, func in self.metrics.items():
            model.train_dict[name] = model.train_dict.get(name, []) + [func(episode_rewards)]
        model.train_dict[self.epoch_name] = model.train_dict.get(self.epoch_name, []) + [model.train_dict['epochs_run']]
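# The `metrics` mapping reduces the collected episode rewards to scalars that
# are appended to train_dict. A plausible configuration (an assumption, not
# from the source):
metrics = {
    'val_reward_mean': np.mean,
    'val_reward_std': np.std,
    'val_reward_max': np.max,
}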
def __call__(self, agent, observation):
    pipeline = Pipeline(pipes=self.pre_pipes + [agent] + self.post_pipes)
    # @todo why is no_grad used here? It means this policy cannot be used for training.
    with torch.no_grad():
        with eval_mode(agent):
            y_mean, y_std = pipeline(observation)
            shape = y_mean.shape
            # diagonal Gaussian over the flattened action, covariance diag(y_std ** 2)
            dist = MultivariateNormal(
                loc=y_mean.view(-1),
                covariance_matrix=y_std.view(-1) ** 2 *
                torch.eye(len(y_std.view(-1))))
            return dist.sample().view(shape)
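# Since the covariance above is diagonal, an equivalent and cheaper variant
# samples each dimension independently and avoids building the n x n matrix
# (assuming, as the squaring above suggests, that y_std holds standard
# deviations):
from torch.distributions import Normal

def sample_diagonal(y_mean, y_std):
    return Normal(y_mean, y_std).sample()   # same distribution as above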
def forward(self, model):
    print('Visualizing environment...')
    with eval_mode(model):
        terminate = False
        episode_reward = 0
        observation = self.env.reset().detach()
        while not terminate:
            action = self.action_selector(model, observation)
            new_observation, reward, done, _ = self.env.step(action)
            episode_reward += torch.sum(
                torch.tensor(reward)).item() / self.env.n_instances
            observation = new_observation
            terminate = done
            self.env.render()
    print(f'Visual evaluation reward for model: {episode_reward:.2f}')
def play_episode(self):
    state_old = self.env.reset().detach()
    episode_reward = 0
    step_counter = 0
    terminate = False
    episode_memory = Memory(
        ['action', 'state', 'reward', 'new_state', 'new_action', 'terminal'],
        gamma=self.gamma)
    action_old = None
    while not terminate:
        with eval_mode(self):
            action = self.chose_action(self, state_old)
        state, reward, terminate, _ = self.env.step(action)
        episode_reward += reward
        # transitions are memorized one step late, once the follow-up action
        # needed for the SARSA quintuple (s, a, r, s', a') is known
        if step_counter > 0:
            episode_memory.memorize(
                (action_old, state_old, torch.tensor(reward_old).float(),
                 state, action, False),
                ['action', 'state', 'reward', 'new_state', 'new_action',
                 'terminal'])
        prev_state = state_old
        state_old = state
        reward_old = reward
        action_old = action
        step_counter += 1
    # memorize the final transition: the last action was taken in prev_state,
    # and with terminal=True the stored follow-up action is never bootstrapped
    episode_memory.memorize(
        (action_old, prev_state, torch.tensor(reward_old).float(), state,
         action_old, True),
        ['action', 'state', 'reward', 'new_state', 'new_action', 'terminal'])
    self.train_loader.memorize(episode_memory,
                               episode_memory.memory_cell_names)
    self.train_dict['rewards'] = self.train_dict.get('rewards', []) + [episode_reward]
    if episode_reward > self.train_dict.get('best_performance', -np.inf):
        self.train_dict['best_performance'] = episode_reward
    return episode_reward
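# Schematically, the stored quintuple feeds a SARSA-style update (assumed to
# match the fit_epoch below); on terminal transitions the bootstrap term is
# masked out:
#   Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * Q(s', a') * (1 - terminal))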
def fit_epoch(self, device, verbose=1):
    self.model.train()
    self.model.to(device)
    self.target_model.to(device)
    for batch, (action, state, reward, next_state, next_action,
                terminal) in tqdm(enumerate(self.train_loader)):
        action, state, reward, next_state = (
            action.to(self.device), state.to(self.device),
            reward.to(self.device), next_state.to(self.device))
        prediction = self.model(state.squeeze(1))
        next_action = one_hot_encoding(next_action).to(self.device)
        with eval_mode(self):  # @todo this is not working with DDQN so far
            next_Q = (self.target_model(next_state.squeeze(1)) *
                      next_action).sum(1)
        target = prediction.clone().detach()
        mask = one_hot_encoding(action).type(torch.BoolTensor)
        # soft update of the taken action's Q-value towards the bootstrapped
        # target; the (1 - terminal) factor masks the bootstrap on final steps
        target[mask] = (1 - self.alpha) * target[mask] + self.alpha * (
            reward + self.gamma * next_Q *
            (1. - terminal.type(torch.FloatTensor)).to(self.device))
        loss = self.crit(prediction, target)
        self.train_dict['train_losses'] += [loss.item()]
        self._backward(loss)
        self.update_target_network()
    if verbose == 1:
        print(f'epoch: {self.train_dict["epochs_run"]}\t'
              f'average reward: {np.mean(self.train_dict["rewards"]):.2f}\t'
              f'latest average reward: {self.train_dict["avg_reward"][-1]:.2f}')
    return loss
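# Toy check of the target rule above (numbers made up): with alpha = 0.5,
# gamma = 0.9, current estimate Q(s, a) = 1.0, reward r = 1.0 and
# next_Q = 2.0 on a non-terminal step, the blended target is
#   0.5 * 1.0 + 0.5 * (1.0 + 0.9 * 2.0) = 1.9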
def get_max_Q_for_states(self, states):
    # @todo we might have trouble with MC Dropout here
    with eval_mode(self):
        max_Q = self.target_model(states.squeeze(1)).max(dim=1)[0]
    return max_Q
def get_max_Q_for_states(self, states):
    # @todo we might have trouble with MC Dropout here
    with eval_mode(self):
        # dueling head: combine the state value with per-action advantages
        value, advantage = self.model(states.squeeze(1))
        max_Q = (value + advantage).max(dim=1)[0]
    return max_Q
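# Regarding the @todo above: eval_mode presumably switches dropout off, which
# defeats MC Dropout uncertainty estimates. A hedged alternative (my
# assumption, not the library's API) keeps only the dropout layers sampling:
import torch

def enable_mc_dropout(model):
    model.eval()                      # freeze batch norm statistics etc.
    for module in model.modules():
        if isinstance(module, torch.nn.Dropout):
            module.train()            # keep sampling dropout masks at inference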