Example #1
    def __call__(self, agent, selection_strategy, memory):
        observation = agent.env.reset().detach()
        episode_reward = 0
        step_counter = 0
        terminate = False

        # memory cells: ['action', 'state', 'reward', 'new_state', 'terminal', 'uncertainty']
        episode_memory = Memory(agent.train_loader.memory_cell_names,
                                gamma=memory.gamma)
        with eval_mode(agent):
            while not terminate:
                step_counter += 1
                agent.to(agent.device)
                action, certainty = selection_strategy(
                    agent, observation.to(agent.device))
                new_observation, reward, terminate, _ = agent.env.step(action)

                episode_reward += torch.sum(
                    reward).item() / agent.env.n_instances
                episode_memory.memorize(
                    (action, observation, torch.tensor(reward).float(),
                     new_observation, terminate, certainty.detach()), [
                         'action', 'state', 'reward', 'new_state', 'terminal',
                         'uncertainty'
                     ])
                observation = new_observation[~terminate.view(-1)]
                terminate = terminate.min().item()
        memory.memorize(episode_memory, episode_memory.memory_cell_names)
        agent.train_dict['rewards'] = agent.train_dict.get(
            'rewards', []) + [episode_reward]

        if episode_reward > agent.train_dict.get('best_performance', -np.inf):
            agent.train_dict['best_performance'] = episode_reward

        return episode_reward
Example #2
    def play_episode(self):
        observation = self.env.reset().detach()
        episode_reward = 0
        step_counter = 0
        terminate = False
        episode_memory = Memory(
            ['action', 'state', 'reward', 'new_state', 'terminal'],
            gamma=self.gamma)
        with eval_mode(self):
            while not terminate:
                step_counter += 1
                with torch.no_grad():
                    action = self.chose_action(self, observation)
                new_observation, reward, terminate, _ = self.env.step(action)

                episode_reward += reward
                episode_memory.memorize(
                    (action, observation, torch.tensor(reward).float(),
                     new_observation, terminate),
                    ['action', 'state', 'reward', 'new_state', 'terminal'])
                observation = new_observation

        self.train_loader.memorize(episode_memory,
                                   episode_memory.memory_cell_names)
        self.train_dict['rewards'] = self.train_dict.get(
            'rewards', []) + [episode_reward]

        if episode_reward > self.train_dict.get('best_performance', -np.inf):
            self.train_dict['best_performance'] = episode_reward

        return episode_reward
Example #3
    def forward(self, model):
        with eval_mode(model):
            model.to(model.device)
            loss = []
            accuracies = []
            for data, y in self.data_loader:
                data = data.to(model.device)
                y = y.to(model.device)
                y_pred = model.model(data)
                loss += [model.crit(y_pred, y)]

                y_pred = y_pred.max(dim=1)[1]
                accuracies += [(y_pred == y).float()]

            loss = torch.stack(loss).mean().item()
            model.train_dict['val_losses'] = model.train_dict.get('val_losses', []) + [loss]
            model.train_dict['val_epochs'] = model.train_dict.get('val_epochs', []) + [model.train_dict['epochs_run']]
            accuracy = torch.cat(accuracies).mean().item()
            model.train_dict['val_accuracy'] = model.train_dict.get('val_accuracy', []) + [accuracy]

        if loss < model.train_dict.get('best_val_performance', np.inf):
            model.train_dict['best_val_performance'] = loss
            model.train_dict['epochs_since_last_val_improvement'] = 0

        if self.verbose == 1:
            print('val loss: {:.4f} - val accuracy: {:.4f}'.format(loss, accuracy))
        return loss
Example #4
    def forward(self, model):
        if model.train_dict['epochs_run'] % self.frequency == 0:
            print('Evaluating environment...', flush=True)
            with eval_mode(model):
                episode_rewards = []
                for _ in tqdm(range(self.n_evaluations)):
                    terminate = False
                    episode_reward = 0
                    observation = self.env.reset().detach()
                    while not terminate:
                        action = self.action_selector(model, observation)
                        new_observation, reward, terminate, _ = self.env.step(
                            action)
                        episode_reward += torch.sum(
                            torch.tensor(reward)).item() / self.env.n_instances
                        observation = new_observation
                        terminate = terminate.min().item()
                    episode_rewards += [episode_reward]

            print(
                f'Evaluation reward for {model.name}: {np.mean(episode_rewards):.2f}',
                flush=True)
            for name, func in self.metrics.items():
                model.train_dict[name] = model.train_dict.get(
                    name, []) + [func(episode_rewards)]
            model.train_dict[self.epoch_name] = model.train_dict.get(
                self.epoch_name, []) + [model.train_dict['epochs_run']]
Example #5
    def __call__(self, agent, observation):
        pipeline = Pipeline(pipes=self.pre_pipes + [agent] + self.post_pipes)
        with torch.no_grad():  # @todo why am I using a no grad here? This policy can then not be used for training?
            with eval_mode(agent):
                y_mean, y_std = pipeline(observation)
        shape = y_mean.shape
        dist = MultivariateNormal(loc=y_mean.view(-1),
                                  covariance_matrix=y_std.view(-1) ** 2 * torch.eye(len(y_std.view(-1))))
        return dist.sample().view(shape)
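
Since the covariance matrix built above is diagonal (diag(y_std ** 2)), sampling from the MultivariateNormal is equivalent to drawing each action dimension independently from its own Gaussian. A minimal standalone sketch of that equivalence, with made-up tensors standing in for the pipeline output (not the actual Pipeline from the example):

import torch
from torch.distributions import MultivariateNormal, Normal

y_mean = torch.tensor([[0.5, -1.0]])  # hypothetical mean output of the pipeline
y_std = torch.tensor([[0.1, 0.2]])    # hypothetical std output of the pipeline

# Diagonal multivariate normal, constructed as in the example above.
dist = MultivariateNormal(
    loc=y_mean.view(-1),
    covariance_matrix=y_std.view(-1) ** 2 * torch.eye(len(y_std.view(-1))))
sample_mvn = dist.sample().view(y_mean.shape)

# Equivalent formulation: independent per-dimension Gaussians.
sample_indep = Normal(y_mean, y_std).sample()

print(sample_mvn.shape, sample_indep.shape)  # both (1, 2), drawn from the same distribution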
Example #6
    def forward(self, model):
        print('Visualizing environment...')
        with eval_mode(model):
            terminate = False
            episode_reward = 0
            observation = self.env.reset().detach()
            while not terminate:
                action = self.action_selector(model, observation)
                new_observation, reward, done, _ = self.env.step(action)
                episode_reward += torch.sum(
                    torch.tensor(reward)).item() / self.env.n_instances
                observation = new_observation
                terminate = done.min().item()
                self.env.render()

        print(f'Visual evaluation reward for model: {episode_reward:.2f}')
Example #7
    def play_episode(self):
        state_old = self.env.reset().detach()
        episode_reward = 0
        step_counter = 0
        terminate = False
        episode_memory = Memory([
            'action', 'state', 'reward', 'new_state', 'new_action', 'terminal'
        ],
                                gamma=self.gamma)

        action_old = None

        while not terminate:
            with eval_mode(self):
                action = self.chose_action(self, state_old)
            state, reward, terminate, _ = self.env.step(action)

            episode_reward += reward
            if step_counter > 0:
                episode_memory.memorize(
                    (action_old, state_old, torch.tensor(reward_old).float(),
                     state, action, False), [
                         'action', 'state', 'reward', 'new_state',
                         'new_action', 'terminal'
                     ])
            state_old = state
            reward_old = reward
            action_old = action
            step_counter += 1

        # memorize final step
        episode_memory.memorize(
            (action_old, state_old, torch.tensor(reward_old).float(), state,
             action, True), [
                 'action', 'state', 'reward', 'new_state', 'new_action',
                 'terminal'
             ])

        self.train_loader.memorize(episode_memory,
                                   episode_memory.memory_cell_names)
        self.train_dict['rewards'] = self.train_dict.get(
            'rewards', []) + [episode_reward]

        if episode_reward > self.train_dict.get('best_performance', -np.inf):
            self.train_dict['best_performance'] = episode_reward

        return episode_reward
Example #8
    def fit_epoch(self, device, verbose=1):
        self.model.train()
        self.model.to(device)
        self.target_model.to(device)

        for batch, (action, state, reward, next_state, next_action,
                    terminal) in tqdm(enumerate(self.train_loader)):
            action, state, reward, next_state = action.to(
                self.device), state.to(self.device), reward.to(
                    self.device), next_state.to(self.device)
            prediction = self.model(state.squeeze(1))
            next_action = one_hot_encoding(next_action).to(self.device)
            with eval_mode(self):  # @todo this is not working with DDQN so far
                next_Q = (self.target_model(next_state.squeeze(1)) *
                          next_action).sum(1)
            target = prediction.clone().detach()

            mask = one_hot_encoding(action).type(torch.BoolTensor)
            target[mask] = (1 - self.alpha) * target[mask] + self.alpha * (
                reward + self.gamma * next_Q *
                (1. - terminal.type(torch.FloatTensor)).to(self.device))

            loss = self.crit(prediction, target)
            self.train_dict['train_losses'] += [loss.item()]
            self._backward(loss)

        self.update_target_network()

        if verbose == 1:
            print(
                f'epoch: {self.train_dict["epochs_run"]}\t'
                f'average reward: {np.mean(self.train_dict["rewards"]):.2f}\t'
                f'latest average reward: {self.train_dict["avg_reward"][-1]:.2f}'
            )

        return loss
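
The target assignment above is the network form of the one-step update Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * Q_target(s', a') * (1 - terminal)): only the entries of the actions actually taken are moved, and terminal transitions drop the bootstrap term. A self-contained numeric sketch of that blend, with toy tensors in place of the models and the one_hot_encoding helper:

import torch

alpha, gamma = 0.1, 0.99

prediction = torch.tensor([[1.0, 2.0], [0.5, 0.0]])  # toy Q(s, .) from the online model
next_Q = torch.tensor([1.5, 0.8])                    # toy Q_target(s', a') for the chosen next action
reward = torch.tensor([1.0, -1.0])
terminal = torch.tensor([0.0, 1.0])                  # 1.0 marks the end of an episode
mask = torch.tensor([[True, False], [False, True]])  # one-hot mask of the taken actions

target = prediction.clone()
target[mask] = (1 - alpha) * target[mask] + alpha * (
    reward + gamma * next_Q * (1.0 - terminal))

print(target)  # only the masked entries change; the terminal row ignores next_Q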
Example #9
    def get_max_Q_for_states(self, states):
        with eval_mode(self):  # @todo we might have trouble with the MC Dropout here
            max_Q = self.target_model(states.squeeze(1)).max(dim=1)[0]
        return max_Q
Example #10
    def get_max_Q_for_states(self, states):
        with eval_mode(self):  # @todo we might have trouble with the MC Dropout here
            value, advantage = self.model(states.squeeze(1))
            max_Q = (value + advantage).max(dim=1)[0]
        return max_Q