Example #1
# Imports needed by this snippet. The standard-library and PyTorch imports are
# certain; the project-local helpers (the Lazy*MultiStepMemory classes,
# RunningMeanStats and update_params) are left as commented hints because their
# module paths are not shown here.
import os
from abc import ABC, abstractmethod

import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter

# from memory import LazyMultiStepMemory, LazyPrioritizedMultiStepMemory  # path assumed
# from utils import RunningMeanStats, update_params                       # path assumed


class BaseAgent(ABC):
    def __init__(self,
                 env,
                 test_env,
                 log_dir,
                 num_steps=100000,
                 batch_size=64,
                 memory_size=1000000,
                 gamma=0.99,
                 multi_step=1,
                 target_entropy_ratio=0.98,
                 start_steps=20000,
                 update_interval=4,
                 target_update_interval=8000,
                 use_per=False,
                 num_eval_steps=125000,
                 max_episode_steps=27000,
                 log_interval=10,
                 eval_interval=1000,
                 device='cuda:0',
                 seed=0):

        super().__init__()
        self.env = env
        self.test_env = test_env

        # Set seed.
        torch.manual_seed(seed)
        np.random.seed(seed)
        self.env.seed(seed)
        self.test_env.seed(2**31 - 1 - seed)
        # torch.backends.cudnn.deterministic = True  # It degrades performance.
        # torch.backends.cudnn.benchmark = False  # It degrades performance.

        self.device = torch.device(
            device if torch.cuda.is_available() else "cpu")

        # LazyMemory efficiently stores FrameStacked states.
        if use_per:
            beta_steps = (num_steps - start_steps) / update_interval
            self.memory = LazyPrioritizedMultiStepMemory(
                capacity=memory_size,
                state_shape=self.env.observation_space.shape,
                device=self.device,
                gamma=gamma,
                multi_step=multi_step,
                beta_steps=beta_steps)
        else:
            self.memory = LazyMultiStepMemory(
                capacity=memory_size,
                state_shape=self.env.observation_space.shape,
                device=self.device,
                gamma=gamma,
                multi_step=multi_step)

        self.log_dir = log_dir
        self.model_dir = os.path.join(log_dir, 'model')
        self.summary_dir = os.path.join(log_dir, 'summary')
        if not os.path.exists(self.model_dir):
            os.makedirs(self.model_dir)
        if not os.path.exists(self.summary_dir):
            os.makedirs(self.summary_dir)

        self.writer = SummaryWriter(log_dir=self.summary_dir)
        self.train_return = RunningMeanStats(log_interval)

        self.steps = 0
        self.learning_steps = 0
        self.episodes = 0
        self.best_eval_score = -np.inf
        self.num_steps = num_steps
        self.batch_size = batch_size
        self.gamma_n = gamma**multi_step
        self.start_steps = start_steps
        self.update_interval = update_interval
        self.target_update_interval = target_update_interval
        self.use_per = use_per
        self.num_eval_steps = num_eval_steps
        self.max_episode_steps = max_episode_steps
        self.log_interval = log_interval
        self.eval_interval = eval_interval

    def run(self):
        while True:
            self.train_episode()
            if self.steps > self.num_steps:
                break

    def is_update(self):
        return self.steps % self.update_interval == 0\
            and self.steps >= self.start_steps

    @abstractmethod
    def explore(self, state):
        pass

    @abstractmethod
    def exploit(self, state):
        pass

    @abstractmethod
    def update_target(self):
        pass

    @abstractmethod
    def calc_current_q(self, states, actions, rewards, next_states, dones):
        pass

    @abstractmethod
    def calc_target_q(self, states, actions, rewards, next_states, dones):
        pass

    @abstractmethod
    def calc_critic_loss(self, batch, weights):
        pass

    @abstractmethod
    def calc_policy_loss(self, batch, weights):
        pass

    @abstractmethod
    def calc_entropy_loss(self, entropies, weights):
        pass

    def train_episode(self):
        self.episodes += 1
        episode_return = 0.
        episode_steps = 0

        done = False
        state = self.env.reset()

        while (not done) and episode_steps <= self.max_episode_steps:

            if self.start_steps > self.steps:
                action = self.env.action_space.sample()
            else:
                action = self.explore(state)

            next_state, reward, done, _ = self.env.step(action)

            # Clip reward to [-1.0, 1.0].
            clipped_reward = max(min(reward, 1.0), -1.0)

            # For efficiency, the new transition is stored with priority=max_priority.
            self.memory.append(state, action, clipped_reward, next_state, done)

            self.steps += 1
            episode_steps += 1
            episode_return += reward
            state = next_state

            if self.is_update():
                self.learn()

            if self.steps % self.target_update_interval == 0:
                self.update_target()

            if self.steps % self.eval_interval == 0:
                self.evaluate()
                self.save_models(os.path.join(self.model_dir, 'final'))

        # Log the running mean of training episode returns.
        self.train_return.append(episode_return)

        if self.episodes % self.log_interval == 0:
            self.writer.add_scalar('reward/train', self.train_return.get(),
                                   self.steps)

        print(f'Episode: {self.episodes:<4}  '
              f'Episode steps: {episode_steps:<4}  '
              f'Return: {episode_return:<5.1f}')

    def learn(self):
        assert hasattr(self, 'q1_optim') and hasattr(self, 'q2_optim') and\
            hasattr(self, 'policy_optim') and hasattr(self, 'alpha_optim')

        self.learning_steps += 1

        if self.use_per:
            batch, weights = self.memory.sample(self.batch_size)
        else:
            batch = self.memory.sample(self.batch_size)
            # Set priority weights to 1 when we don't use PER.
            weights = 1.

        q1_loss, q2_loss, errors, mean_q1, mean_q2 = \
            self.calc_critic_loss(batch, weights)
        policy_loss, entropies = self.calc_policy_loss(batch, weights)
        entropy_loss = self.calc_entropy_loss(entropies, weights)

        update_params(self.q1_optim, q1_loss)
        update_params(self.q2_optim, q2_loss)
        update_params(self.policy_optim, policy_loss)
        update_params(self.alpha_optim, entropy_loss)

        self.alpha = self.log_alpha.exp()

        if self.use_per:
            self.memory.update_priority(errors)

        if self.learning_steps % self.log_interval == 0:
            self.writer.add_scalar('loss/Q1',
                                   q1_loss.detach().item(),
                                   self.learning_steps)
            self.writer.add_scalar('loss/Q2',
                                   q2_loss.detach().item(),
                                   self.learning_steps)
            self.writer.add_scalar('loss/policy',
                                   policy_loss.detach().item(),
                                   self.learning_steps)
            self.writer.add_scalar('loss/alpha',
                                   entropy_loss.detach().item(),
                                   self.learning_steps)
            self.writer.add_scalar('stats/alpha',
                                   self.alpha.detach().item(),
                                   self.learning_steps)
            self.writer.add_scalar('stats/mean_Q1', mean_q1,
                                   self.learning_steps)
            self.writer.add_scalar('stats/mean_Q2', mean_q2,
                                   self.learning_steps)
            self.writer.add_scalar('stats/entropy',
                                   entropies.detach().mean().item(),
                                   self.learning_steps)

    def evaluate(self):
        num_episodes = 0
        num_steps = 0
        total_return = 0.0

        while True:
            state = self.test_env.reset()
            episode_steps = 0
            episode_return = 0.0
            done = False
            while (not done) and episode_steps <= self.max_episode_steps:
                action = self.exploit(state)
                next_state, reward, done, _ = self.test_env.step(action)
                num_steps += 1
                episode_steps += 1
                episode_return += reward
                state = next_state

            num_episodes += 1
            total_return += episode_return

            if num_steps > self.num_eval_steps:
                break

        mean_return = total_return / num_episodes

        if mean_return > self.best_eval_score:
            self.best_eval_score = mean_return
            self.save_models(os.path.join(self.model_dir, 'best'))

        self.writer.add_scalar('reward/test', mean_return, self.steps)
        print('-' * 60)
        print(f'Num steps: {self.steps:<5}  ' f'return: {mean_return:<5.1f}')
        print('-' * 60)

    @abstractmethod
    def save_models(self, save_dir):
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

    def __del__(self):
        self.env.close()
        self.test_env.close()
        self.writer.close()
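
The base class precomputes gamma_n = gamma**multi_step for the n-step bootstrap target that concrete agents build in calc_target_q (left abstract here). Below is a minimal, self-contained sketch of such a target, assuming the replay memory already returns the discounted sum of rewards over the n-step window; the function name n_step_target and the sample numbers are local to this sketch, not part of the repository.

import torch


def n_step_target(rewards, next_q, dones, gamma=0.99, multi_step=3):
    """rewards: (B,) discounted n-step reward sums; next_q: (B,) value estimate
    of the state n steps ahead; dones: (B,) 1.0 where the episode terminated
    inside the n-step window."""
    gamma_n = gamma ** multi_step  # same quantity as self.gamma_n above
    return rewards + (1.0 - dones) * gamma_n * next_q


# Tiny usage example.
rewards = torch.tensor([0.5, 1.2])
next_q = torch.tensor([3.0, -0.4])
dones = torch.tensor([0.0, 1.0])
print(n_step_target(rewards, next_q, dones))  # tensor([3.4109, 1.2000])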
Example #2
# Imports needed by this snippet. The standard-library and PyTorch imports are
# certain; the project-local helpers (RecurrentMemory, RunningMeanStats and
# update_params) are left as commented hints because their module paths are
# not shown here.
import os
from abc import ABC, abstractmethod

import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter

# from memory import RecurrentMemory               # path assumed
# from utils import RunningMeanStats, update_params  # path assumed


class BaseAgent(ABC):
    def __init__(self,
                 env,
                 test_env,
                 log_dir,
                 num_steps=100000,
                 batch_size=64,
                 memory_size=1000000,
                 gamma=0.99,
                 multi_step=1,
                 target_entropy_ratio=0.98,
                 start_steps=20000,
                 update_interval=4,
                 target_update_interval=8000,
                 use_per=False,
                 num_eval_steps=50000,
                 max_episode_steps=10000,
                 log_interval=10,
                 eval_interval=500,
                 cuda=True,
                 seed=0):
        super().__init__()
        self.env = env
        self.test_env = test_env

        # Set seed.
        torch.manual_seed(seed)
        np.random.seed(seed)
        self.env.seed(seed)
        self.test_env.seed(2**31 - 1 - seed)
        # torch.backends.cudnn.deterministic = True  # It degrades performance.
        # torch.backends.cudnn.benchmark = False  # It degrades performance.

        self.device = torch.device(
            "cuda" if cuda and torch.cuda.is_available() else "cpu")

        # LazyMemory efficiently stores FrameStacked states.
        # if use_per:
        #     beta_steps = (num_steps - start_steps) / update_interval
        #     self.memory = LazyPrioritizedMultiStepMemory(
        #         capacity=memory_size,
        #         state_shape=self.env.observation_space.shape,
        #         device=self.device, gamma=gamma, multi_step=multi_step,
        #         beta_steps=beta_steps)
        # else:
        #     self.memory = LazyMultiStepMemory(
        #         capacity=memory_size,
        #         state_shape=self.env.observation_space.shape,
        #         device=self.device, gamma=gamma, multi_step=multi_step)

        self.memory = RecurrentMemory(
            capacity=memory_size,
            state_shape=self.env.observation_space.shape,
            device=self.device,
            sequen_len=6)

        self.log_dir = log_dir
        self.model_dir = os.path.join(log_dir, 'model')
        self.summary_dir = os.path.join(log_dir, 'summary')
        if not os.path.exists(self.model_dir):
            os.makedirs(self.model_dir)
        if not os.path.exists(self.summary_dir):
            os.makedirs(self.summary_dir)

        self.writer = SummaryWriter(log_dir=self.summary_dir)
        self.train_return = RunningMeanStats(log_interval)

        self.steps = 0
        self.learning_steps = 0
        self.episodes = 0
        self.best_eval_score = -np.inf
        self.num_steps = num_steps
        self.batch_size = batch_size
        self.gamma_n = gamma**multi_step
        self.start_steps = start_steps
        self.update_interval = update_interval
        self.target_update_interval = target_update_interval
        self.use_per = use_per
        self.num_eval_steps = num_eval_steps
        self.max_episode_steps = max_episode_steps
        self.log_interval = log_interval
        self.eval_interval = eval_interval

        self._states = self.env.reset()
        self.num_feats = self.env.observation_space.shape[0]
        self.sequence_length = 6

    def run(self):
        while True:
            self.train_episode()
            if self.steps > self.num_steps:
                break

    def is_update(self):
        return self.steps % self.update_interval == 0\
            and self.steps >= self.start_steps

    @abstractmethod
    def explore(self, state):
        pass

    @abstractmethod
    def exploit(self, state):
        pass

    @abstractmethod
    def update_target(self):
        pass

    @abstractmethod
    def calc_current_q(self, states, actions, rewards, next_states, dones):
        pass

    @abstractmethod
    def calc_target_q(self, states, actions, rewards, next_states, dones):
        pass

    @abstractmethod
    def calc_critic_loss(self, batch, weights):
        pass

    @abstractmethod
    def calc_policy_loss(self, batch, weights):
        pass

    @abstractmethod
    def calc_entropy_loss(self, entropies, weights):
        pass

    def train_episode(self):
        self.episodes += 1
        episode_return = 0.
        episode_steps = 0

        done = False
        state = self.env.reset()

        while (not done) and episode_steps <= self.max_episode_steps:

            if self.start_steps > self.steps:
                action = self.env.action_space.sample()
            else:
                action = self.explore(state)

            next_state, reward, done, _ = self.env.step(action)

            # Reward clipping from the original implementation is disabled here;
            # the raw reward is stored.
            # clipped_reward = max(min(reward, 1.0), -1.0)

            self.memory.push(state, action, reward, next_state, done)

            self.steps += 1
            episode_steps += 1
            episode_return += reward
            state = next_state

            if self.is_update():
                self.learn()

            if self.steps % self.target_update_interval == 0:
                self.update_target()

            if self.steps % self.eval_interval == 0:
                self.evaluate()
                self.save_models(os.path.join(self.model_dir, 'final'))

        # Log the running mean of training episode returns.
        self.train_return.append(episode_return)

        if self.episodes % self.log_interval == 0:
            self.writer.add_scalar('reward/train', self.train_return.get(),
                                   self.steps)

        print(f'Episode: {self.episodes:<4}  '
              f'Episode steps: {episode_steps:<4}  '
              f'Return: {episode_return:<5.1f}')

    def prep_minibatch(self):
        transitions = self.memory.sample(self.batch_size)

        batch_state, batch_action, batch_reward, batch_next_state = zip(
            *transitions)

        # self.num_feats is an int (observation_space.shape[0]), so wrap it in a
        # tuple before appending it to the batch and sequence dimensions.
        shape = (self.batch_size, self.sequence_length) + (self.num_feats,)

        batch_state = torch.tensor(batch_state,
                                   device=self.device,
                                   dtype=torch.float).view(shape)
        batch_action = torch.tensor(batch_action,
                                    device=self.device,
                                    dtype=torch.long).view(
                                        self.batch_size, self.sequence_length,
                                        -1)
        batch_reward = torch.tensor(batch_reward,
                                    device=self.device,
                                    dtype=torch.float).view(
                                        self.batch_size, self.sequence_length)
        # Keep only the next state at the end of each sequence.
        batch_next_state = tuple([
            batch_next_state[i] for i in range(len(batch_next_state))
            if (i + 1) % (self.sequence_length) == 0
        ])

        # Boolean mask over sequences whose final next state is non-terminal.
        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, batch_next_state)),
                                      device=self.device,
                                      dtype=torch.bool)

        # Sometimes every next state is None (terminal), especially with n-step
        # returns; in that case there is nothing to bootstrap from.
        non_final_next_states = None
        try:
            non_final_next_states = torch.tensor(
                [s for s in batch_next_state if s is not None],
                device=self.device,
                dtype=torch.float).unsqueeze(dim=1)
            non_final_next_states = torch.cat(
                [batch_state[non_final_mask, 1:, :], non_final_next_states],
                dim=1)
            empty_next_state_values = False
        except Exception:
            empty_next_state_values = True

        return batch_state, batch_action, batch_reward, non_final_next_states, non_final_mask, empty_next_state_values

    def learn(self):
        assert hasattr(self, 'q1_optim') and hasattr(self, 'q2_optim') and\
            hasattr(self, 'policy_optim') and hasattr(self, 'alpha_optim')

        self.learning_steps += 1

        # if self.use_per:
        #     batch, weights = self.memory.sample(self.batch_size)
        # else:
        #     batch = self.memory.sample(self.batch_size)
        #     # Set priority weights to 1 when we don't use PER.
        #     weights = 1.
        batch = self.prep_minibatch()
        weights = 1.

        q1_loss, q2_loss, errors, mean_q1, mean_q2 = \
            self.calc_critic_loss(batch, weights)
        policy_loss, entropies = self.calc_policy_loss(batch, weights)
        entropy_loss = self.calc_entropy_loss(entropies, weights)

        update_params(self.q1_optim, q1_loss)
        update_params(self.q2_optim, q2_loss)
        update_params(self.policy_optim, policy_loss)
        update_params(self.alpha_optim, entropy_loss)

        self.alpha = self.log_alpha.exp()

        if self.use_per:
            self.memory.update_priority(errors)

        if self.learning_steps % self.log_interval == 0:
            self.writer.add_scalar('loss/Q1',
                                   q1_loss.detach().item(),
                                   self.learning_steps)
            self.writer.add_scalar('loss/Q2',
                                   q2_loss.detach().item(),
                                   self.learning_steps)
            self.writer.add_scalar('loss/policy',
                                   policy_loss.detach().item(),
                                   self.learning_steps)
            self.writer.add_scalar('loss/alpha',
                                   entropy_loss.detach().item(),
                                   self.learning_steps)
            self.writer.add_scalar('stats/alpha',
                                   self.alpha.detach().item(),
                                   self.learning_steps)
            self.writer.add_scalar('stats/mean_Q1', mean_q1,
                                   self.learning_steps)
            self.writer.add_scalar('stats/mean_Q2', mean_q2,
                                   self.learning_steps)
            self.writer.add_scalar('stats/entropy',
                                   entropies.detach().mean().item(),
                                   self.learning_steps)

    def evaluate(self):
        num_episodes = 0
        num_steps = 0
        total_return = 0.0

        while True:
            state = self.test_env.reset()
            episode_steps = 0
            episode_return = 0.0
            done = False
            # Rolling window of recent observations, shape (seq_len, features).
            state_memory = state.reshape(1, -1)
            while (not done) and episode_steps <= self.max_episode_steps:
                #print(episode_steps )
                action = self.exploit(state_memory)
                #print(action)
                next_state, reward, done, _ = self.test_env.step(action)
                num_steps += 1
                episode_steps += 1
                episode_return += reward
                state = next_state.reshape(1, -1)
                state_memory = np.insert(state_memory,
                                         state_memory.shape[0],
                                         values=state,
                                         axis=0)
                # Keep only the most recent sequence_length observations.
                if state_memory.shape[0] > self.sequence_length:
                    state_memory = np.delete(state_memory, 0, axis=0)

            num_episodes += 1
            total_return += episode_return

            if num_steps > self.num_eval_steps:
                break

        mean_return = total_return / num_episodes

        if mean_return > self.best_eval_score:
            self.best_eval_score = mean_return
            self.save_models(os.path.join(self.model_dir, 'best'))

        self.writer.add_scalar('reward/test', mean_return, self.steps)
        print('-' * 60)
        print(f'Num steps: {self.steps:<5}  ' f'return: {mean_return:<5.1f}')
        print('-' * 60)

    @abstractmethod
    def save_models(self, save_dir):
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

    def __del__(self):
        self.env.close()
        self.test_env.close()
        self.writer.close()
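
In evaluate() above, the recurrent agent feeds the policy a rolling window of recent observations: each new observation is appended with np.insert and the oldest row is dropped once the window exceeds sequence_length entries. The sketch below reproduces just that windowing logic in isolation so it can be checked independently; the name push_observation and the 4-dimensional toy observations are assumptions of this sketch, not part of the class above.

import numpy as np


def push_observation(window, obs, sequence_length=6):
    """window: (t, features) array of recent observations; obs: (features,)."""
    window = np.insert(window, window.shape[0], values=obs.reshape(1, -1), axis=0)
    if window.shape[0] > sequence_length:
        window = np.delete(window, 0, axis=0)  # drop the oldest observation
    return window


# Usage: after 10 steps with 4-dimensional observations, the window holds the
# most recent 6 of them.
window = np.zeros((1, 4))  # observation from reset()
for t in range(10):
    window = push_observation(window, np.full(4, float(t)))
print(window.shape)   # (6, 4)
print(window[:, 0])   # [4. 5. 6. 7. 8. 9.]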