Example #1
class AgentInference:
    def __init__(
            self,
            nn, device,
            distribution,
            distribution_args,
            testing=False
    ):
        self.nn = nn
        self.device = device
        self.nn.to(device)
        self.distribution = distributions_dict[distribution](**distribution_args)
        self.distribution_with_params = False
        if hasattr(self.distribution, 'has_state'):
            self.distribution_with_params = True
            self.distribution.to(device)
        self.obs_normalizer = None
        self.testing = testing

    def load_state_dict(self, state):
        self.nn.load_state_dict(state['nn'])
        if self.distribution_with_params:
            self.distribution.load_state_dict(state['distribution'])

    def load(self, filename, **kwargs):
        checkpoint = torch.load(filename, **kwargs)
        agent_state = checkpoint['agent']
        self.load_state_dict(agent_state)
        if 'obs_normalizer' in checkpoint:
            self.obs_normalizer = RunningMeanStd()
            self.obs_normalizer.load_state_dict(checkpoint['obs_normalizer'])

    def train(self):
        self.nn.train()

    def eval(self):
        self.nn.eval()

    def _act(self, observations, return_pi, deterministic):
        if self.obs_normalizer is not None:
            mean, var = self.obs_normalizer.mean, self.obs_normalizer.var
            observations = (observations - mean) / np.sqrt(var + 1e-8)
        with torch.no_grad():
            policy, _ = self.nn(torch.tensor(observations, dtype=torch.float32, device=self.device))
            # RealNVP requires 'no_grad' here
            action, log_prob = self.distribution.sample(policy, deterministic)
        if return_pi:
            return action, policy
        else:
            return action, log_prob

    def act(self, observations, return_pi=False, deterministic=False):
        """
        :param observations: np.array of observations, shape = [T, B, dim(obs)]
        :param return_pi: True or False. If True, the method returns the full pi, not just log(pi(a))
        :param deterministic: True or False
        :return: action and log_prob during data gathering for training; just the action during testing
        """
        if self.testing:
            observations = [[observations]]
        action, log_prob = self._act(observations, return_pi, deterministic)
        if self.testing:
            return action.cpu().numpy()[0, 0]
        return action, log_prob
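

# A minimal sketch (an assumption, not this project's actual code) of the
# 'distributions_dict' registry referenced in __init__ above. Each entry is a
# class whose instances expose sample(policy, deterministic) -> (action, log_prob);
# this toy Gaussian assumes the network's 'policy' output concatenates mean and
# log-std along the last dimension.
import torch


class Gaussian:
    def sample(self, policy, deterministic):
        mean, log_std = policy.chunk(2, dim=-1)
        std = log_std.exp()
        # deterministic action = distribution mean, stochastic action = reparametrized sample
        action = mean if deterministic else mean + std * torch.randn_like(mean)
        log_prob = torch.distributions.Normal(mean, std).log_prob(action).sum(-1)
        return action, log_prob


distributions_dict = {'Gaussian': Gaussian}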
Example #2
class OnPolicyTrainer(BaseTrainer):
    def __init__(
            self,
            agent_online, agent_train,
            return_pi,
            train_env,
            update_period,
            normalize_obs, train_obs_normalizer,
            scale_reward, normalize_reward, train_reward_normalizer,
            obs_clip, reward_clip,
            warm_up_steps=0,
            **kwargs
    ):
        """On-policy trainer

        :param agent_online: agent which collects data
        :param agent_train: agent which performs train-ops
        :param return_pi: if True then the 'act()' method returns the
                          full policy parametrization instead of the log-prob of the sampled action
        :param train_env: environment for collecting training data
        :param test_env: environment for testing agent once per epoch
        :param update_period: number of train-ops after which
                              online agent loads weights of training agent
        :param normalize_obs: if True then observations will be normalized
                              by running mean and std of collected observations
        :param train_obs_normalizer: if True then running mean and std of obs_normalizer
                                     will be updated each environment step
        :param scale_reward: if True then reward will be scaled
                             by the running std of collected episode returns
        :param normalize_reward: if True then reward will be normalized
                                 by running mean and std of collected rewards
        :param train_reward_normalizer: if True then running mean and std of reward_normalizer
                                        will be updated each environment step
        :param obs_clip: abs(observation) will be clipped to this value after normalization
        :param reward_clip: abs(reward) will be clipped to this value after normalization
        :param warm_up_steps: number of steps not to update online agent,
                              useful to continue training from checkpoint
        :param kwargs: test_env and log_dir
        """
        super().__init__(**kwargs)

        self._agent_online = agent_online  # gather rollouts
        self._agent_train = agent_train  # do train-ops
        self._update_online_agent()
        # weights of the online agent are updated once every 'update_period' train-ops
        # and at the end of each training epoch
        self._update_period = update_period
        self._warm_up_steps = warm_up_steps

        # if return_pi then the agent appends the 'policy' tensor
        # to the rollout, else it appends 'log_pi_a'
        self._return_pi = return_pi
        # both environments should:
        #   be vectorized
        #   reset automatically at the end of an episode
        self._train_env = train_env

        # normalizers:
        self._obs_normalizer = RunningMeanStd() if normalize_obs else None
        self._train_obs_normalizer = train_obs_normalizer
        self._obs_clip = obs_clip
        assert not (normalize_reward and scale_reward), \
            'reward may be normalized or scaled, but not both at the same time!'
        self._reward_scaler = RunningMeanStd() if scale_reward else None
        self._reward_normalizer = RunningMeanStd() if normalize_reward else None
        self._train_reward_normalizer = train_reward_normalizer
        self._reward_clip = reward_clip

        # store episode reward, length, return and number for each train environment
        self._gamma = self._agent_train.gamma
        self._env_reward = np.zeros(train_env.num_envs, dtype=np.float32)
        self._env_episode_len = np.zeros(train_env.num_envs, dtype=np.int32)
        self._env_return = np.zeros(train_env.num_envs, dtype=np.float32)
        self._env_episode = np.zeros(train_env.num_envs, dtype=np.int32)

    def save(self, filename):
        state_dict = {'agent': self._agent_train.state_dict()}
        if self._obs_normalizer is not None:
            state_dict['obs_normalizer'] = self._obs_normalizer.state_dict()
        if self._reward_normalizer is not None:
            state_dict['reward_normalizer'] = self._reward_normalizer.state_dict()
        torch.save(state_dict, filename)

    def load(self, filename):
        checkpoint = torch.load(filename)
        self._agent_online.load_state_dict(checkpoint['agent'])
        self._agent_train.load_state_dict(checkpoint['agent'])
        if 'obs_normalizer' in checkpoint and self._obs_normalizer is not None:
            self._obs_normalizer.load_state_dict(checkpoint['obs_normalizer'])
        if 'reward_normalizer' in checkpoint and self._reward_normalizer is not None:
            self._reward_normalizer.load_state_dict(checkpoint['reward_normalizer'])

    @staticmethod
    def stack_infos(infos):
        keys = infos[0].keys()
        stacked_info = dict()
        for key in keys:
            values = []
            for info in infos:
                values.append(info[key])
            stacked_info[key] = np.stack(values)
        return stacked_info

    # def _act(self, fake_agent, observation, deterministic, training=False, **kwargs):
    #     # 'fake_agent' arg is unused to make this method work in BaseTrainer.test_agent_service
    #     observation = self._normalize_observation(observation, training)
    #     (action, log_prob), act_time = super()._act(
    #         self._agent_online, observation, deterministic,
    #         return_pi=self._return_pi
    #     )
    #     return (action, log_prob), act_time

    def _act(self, fake_agent, observation, deterministic, need_norm=True, **kwargs):
        # used by BaseTrainer._test_agent_service() (default need_norm=True) and by rollout gathering (need_norm=False);
        # the 'fake_agent' arg is unused and only keeps the signature compatible with the base class
        if need_norm:
            observation = self._normalize_observation(observation, False)
        (action, log_prob), act_time = super()._act(
            self._agent_online, observation, deterministic,
            return_pi=self._return_pi
        )
        return (action, log_prob), act_time

    def _normalize_observation(self, observation, training):
        if self._obs_normalizer is not None:
            if training:
                self._obs_normalizer.update(observation)
            mean, var = self._obs_normalizer.mean, self._obs_normalizer.var
            observation = (observation - mean) / np.sqrt(var + 1e-8)
            observation = np.clip(observation, -self._obs_clip, self._obs_clip)
        return observation

    def _normalize_reward(self, reward, training):
        # 'baselines' version:
        if self._reward_scaler is not None:
            if training:
                self._reward_scaler.update(self._env_return)
            var = self._reward_scaler.var
            reward = reward / np.sqrt(var + 1e-8)
            reward = np.clip(reward, -self._reward_clip, self._reward_clip)

        # 'my' version:
        if self._reward_normalizer is not None:
            if training:
                self._reward_normalizer.update(reward)
            mean, var = self._reward_normalizer.mean, self._reward_normalizer.var
            reward = (reward - mean) / np.sqrt(var + 1e-8)
            reward = np.clip(reward, -self._reward_clip, self._reward_clip)

        return reward

    @time_it
    def _gather_rollout(self, observation, rollout_len):
        # this function is only called while the agent is training
        # the initial observation (i.e. at the very beginning of training) is not normalized
        observations, actions, rewards, is_done = [observation], [], [], []
        log_probs = []
        raw_rewards = []

        mean_act_time = 0
        mean_env_time = 0

        for _ in range(rollout_len):
            # the on-policy trainer does not require actions to be differentiable;
            # however, the agent may be used by other algorithms which do require that
            (action, log_prob), act_time = self._act(None, observation, deterministic=False, need_norm=False)
            mean_act_time += act_time

            env_step_result, env_time = self._env_step(self._train_env, action)
            observation, reward, done, _ = env_step_result
            mean_env_time += env_time

            self._env_reward += reward
            self._env_episode_len += 1
            self._env_return = reward + self._gamma * self._env_return

            raw_rewards.append(np.copy(reward))

            observation = self._normalize_observation(observation, training=self._train_obs_normalizer)
            reward = self._normalize_reward(reward, training=self._train_reward_normalizer)

            observations.append(observation)
            actions.append(action)
            rewards.append(reward)
            is_done.append(done)
            log_probs.append(log_prob)

            self._done_callback(done)

        mean_act_time /= rollout_len
        mean_env_time /= rollout_len

        rollout = observations, actions, rewards, is_done, log_probs
        gather_result = rollout, raw_rewards, observation
        mean_time = mean_act_time, mean_env_time
        return gather_result, mean_time

    def _done_callback(self, done):
        if np.any(done):
            for i, d in enumerate(done):
                if d:
                    self._writer.add_scalars(
                        'agents/train_reward/',
                        {f'agent_{i}': self._env_reward[i]},
                        self._env_episode[i]
                    )
                    self._writer.add_scalars(
                        'agents/train_ep_len/',
                        {f'agent_{i}': self._env_episode_len[i]},
                        self._env_episode[i]
                    )
                    self._env_reward[i] = 0.0
                    self._env_episode_len[i] = 0
                    self._env_return[i] = 0.0
                    self._env_episode[i] += 1

    def _train_step(self, observation, rollout_len, step):
        # gather rollout -> train on it -> write training logs
        (gather_result, mean_time), gather_time = self._gather_rollout(observation, rollout_len)

        rollout, raw_rewards, observation = gather_result
        mean_act_time, mean_env_time = mean_time

        train_logs, time_logs = self._agent_train.train_on_rollout(rollout)
        train_logs['reward_mean'] = np.mean(raw_rewards)
        train_logs['reward_std'] = np.std(raw_rewards)

        time_logs['mean_act_time'] = mean_act_time
        time_logs['mean_env_time'] = mean_env_time
        time_logs['gather_rollout_time'] = gather_time

        self._write_logs('train/', train_logs, step)
        self._write_logs('time/', time_logs, step)
        return observation

    def _update_online_agent(self):
        self._agent_online.load_state_dict(self._agent_train.state_dict())

    def _save_n_test(self, epoch, n_tests):
        checkpoint_name = self._log_dir + 'checkpoints/' + f'epoch_{epoch}.pth'
        self.save(checkpoint_name)
        self._test_agent(epoch, n_tests, self._agent_online)

    def train(self, n_epoch, n_steps_per_epoch, rollout_len, n_tests_per_epoch):
        """
        Run training for 'n_epoch' epochs; each epoch performs 'n_steps_per_epoch'
        training steps on rollouts of length 'rollout_len'.
        At the end of each epoch, run 'n_tests_per_epoch' tests and save a checkpoint.

        :param n_epoch: number of training epochs
        :param n_steps_per_epoch: number of training steps per epoch
        :param rollout_len: number of environment steps in each rollout
        :param n_tests_per_epoch: number of test episodes at the end of each epoch
        :return:
        """
        observation = self._train_env.reset()
        self._save_n_test(0, n_tests_per_epoch)

        self._agent_train.train()  # always in training mode
        for epoch in range(n_epoch):
            self._agent_online.train()
            p_bar = trange(n_steps_per_epoch, ncols=90, desc=f'epoch_{epoch}')
            for train_step in p_bar:
                step = train_step + epoch * n_steps_per_epoch
                observation = self._train_step(
                    observation, rollout_len, step
                )
                if step > self._warm_up_steps and (step + 1) % self._update_period == 0:
                    self._update_online_agent()

            self._update_online_agent()
            self._save_n_test(epoch + 1, n_tests_per_epoch)
        self._writer.close()
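

# A minimal sketch (an assumption) of the RunningMeanStd helper this trainer and
# the agents above rely on; the project's real class is not shown in these
# examples. It follows the common OpenAI-baselines parallel mean/variance update
# and exposes the mean/var attributes and state_dict()/load_state_dict() methods
# used above. Later examples additionally assume normalize()/scale() and *_vec()
# helpers built on the same statistics; only a plain normalize() is sketched here.
import numpy as np


class RunningMeanStd:
    def __init__(self, clip=float('inf'), eps=1e-4):
        self.clip = clip
        self.mean = np.zeros((), dtype=np.float64)
        self.var = np.ones((), dtype=np.float64)
        self.count = eps

    def update(self, x):
        # parallel mean/variance (Chan et al.) update from a batch of samples
        x = np.asarray(x, dtype=np.float64)
        batch_mean, batch_var, batch_count = x.mean(axis=0), x.var(axis=0), x.shape[0]
        delta = batch_mean - self.mean
        total_count = self.count + batch_count
        new_mean = self.mean + delta * batch_count / total_count
        m2 = (self.var * self.count + batch_var * batch_count
              + delta ** 2 * self.count * batch_count / total_count)
        self.mean, self.var, self.count = new_mean, m2 / total_count, total_count

    def normalize(self, x):
        x = (x - self.mean) / np.maximum(np.sqrt(self.var), 1e-6)
        return np.clip(x, -self.clip, self.clip)

    def state_dict(self):
        return {'mean': self.mean, 'var': self.var, 'count': self.count}

    def load_state_dict(self, state):
        self.mean, self.var, self.count = state['mean'], state['var'], state['count']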
class AgentInference:
    def __init__(self, nn, device, distribution, distribution_args):
        self.nn = nn
        self.device = device
        self.nn.to(device)
        self.distribution = distributions_dict[distribution](
            **distribution_args)
        self.distribution_with_params = False
        if hasattr(self.distribution, 'has_state'):
            self.distribution_with_params = True
            self.distribution.to(device)
        self.obs_normalizer = None

    def load_state_dict(self, state):
        self.nn.load_state_dict(state['nn'])
        if self.distribution_with_params:
            self.distribution.load_state_dict(state['distribution'])

    def load(self, filename, **kwargs):
        checkpoint = torch.load(filename, **kwargs)
        agent_state = checkpoint['agent']
        self.load_state_dict(agent_state)
        if 'obs_normalizer' in checkpoint:
            self.obs_normalizer = RunningMeanStd()
            self.obs_normalizer.load_state_dict(checkpoint['obs_normalizer'])

    def train(self):
        self.nn.train()

    def eval(self):
        self.nn.eval()

    def act(self, observations, deterministic):
        """
        :param observations: np.array of observations, shape = [B, dim(obs)]
        :param deterministic: if True, the action is taken as the policy mean instead of being sampled
        :return: dict with 'policy', 'value', 'action' and 'log_prob' np.arrays for the batch
        """
        if self.obs_normalizer is not None:
            mean, var = self.obs_normalizer.mean, self.obs_normalizer.var
            observations = (observations - mean) / np.maximum(np.sqrt(var), 1e-6)
        with torch.no_grad():
            nn_result = self.nn(
                torch.tensor([observations],
                             dtype=torch.float32,
                             device=self.device))
            policy, value = nn_result['policy'], nn_result['value']
            # RealNVP requires 'no_grad' here
            action, log_prob = self.distribution.sample(policy, deterministic)

        policy = policy[0].cpu().numpy()
        value = value[0].cpu().numpy()
        action = action[0].cpu().numpy()
        log_prob = log_prob[0].cpu().numpy()

        result = {
            'policy': policy,
            'value': value,
            'action': action,
            'log_prob': log_prob
        }
        return result
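

# A minimal sketch (an assumption) of the network interface this version of
# act() expects: unlike the first example, where self.nn returned a tuple,
# forward() here must return a dict with 'policy' and 'value' entries.
# 'ActorCriticNet' and its layer sizes are illustrative only.
import torch
import torch.nn as nn


class ActorCriticNet(nn.Module):
    def __init__(self, obs_dim=8, action_dim=2):
        super().__init__()
        self.body = nn.Sequential(nn.Linear(obs_dim, 64), nn.Tanh())
        self.policy_head = nn.Linear(64, 2 * action_dim)  # e.g. mean and log-std
        self.value_head = nn.Linear(64, 1)

    def forward(self, observations):
        hidden = self.body(observations)
        return {'policy': self.policy_head(hidden), 'value': self.value_head(hidden)}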
Example #4
class OnPolicyTrainer(BaseTrainer):
    def __init__(
            self,
            agent_online,
            agent_train,
            train_env,
            # kwargs come from the config
            update_period=1,
            normalize_obs=False,
            train_obs_normalizer=False,
            scale_reward=False,
            normalize_reward=False,
            train_reward_normalizer=False,
            obs_clip=float('inf'),
            reward_clip=float('inf'),
            warm_up_steps=0,
            **kwargs):
        """On-policy trainer

        :param agent_online: agent which collects data
        :param agent_train: agent which performs train-ops
        :param train_env: environment for collecting training data
        :param test_env: environment for testing agent once per epoch
        :param update_period: number of train-ops after which
                              online agent loads weights of training agent
        :param normalize_obs: if True then observations will be normalized
                              by running mean and std of collected observations
        :param train_obs_normalizer: if True then running mean and std of obs_normalizer
                                     will be updated each environment step
        :param scale_reward: if True then reward will be scaled
                             by the running std of collected episode returns
        :param normalize_reward: if True then reward will be normalized
                                 by running mean and std of collected rewards
        :param train_reward_normalizer: if True then running mean and std of reward_normalizer
                                        will be updated each environment step
        :param obs_clip: abs(observation) will be clipped to this value after normalization
        :param reward_clip: abs(reward) will be clipped to this value after normalization
        :param warm_up_steps: number of steps not to update online agent,
                              useful to continue training from checkpoint
        :param kwargs: test_env and log_dir
        """
        super().__init__(**kwargs)

        self._agent_online = agent_online  # gather rollouts
        self._agent_train = agent_train  # do train-ops
        self._update_online_agent()
        # weights of the online agent are updated once every 'update_period' train-ops
        # and at the end of each training epoch
        self._update_period = update_period
        self._warm_up_steps = warm_up_steps

        # both environments should:
        #   be vectorized
        #   reset automatically at the end of an episode
        self._train_env = train_env

        # normalizers:
        self._obs_normalizer = RunningMeanStd(
            obs_clip) if normalize_obs else None
        self._train_obs_normalizer = train_obs_normalizer
        assert not (normalize_reward and scale_reward), \
            'reward may be normalized or scaled, but not both at the same time!'
        self._normalize_reward = normalize_reward
        self._scale_reward = scale_reward
        self._reward_normalizer = RunningMeanStd(reward_clip) \
            if normalize_reward or scale_reward else None
        self._train_reward_normalizer = train_reward_normalizer

        self._gamma = self._agent_train.gamma
        # store episode reward, length, return and number for each train environment
        self._env_total_reward = np.zeros(train_env.num_envs, dtype=np.float32)
        self._env_episode_len = np.zeros(train_env.num_envs, dtype=np.int32)
        self._env_discounted_return = np.zeros(train_env.num_envs,
                                               dtype=np.float32)
        self._env_episode_number = np.zeros(train_env.num_envs, dtype=np.int32)

    def save(self, filename):
        state_dict = {'agent': self._agent_train.state_dict()}
        if self._obs_normalizer is not None:
            state_dict['obs_normalizer'] = self._obs_normalizer.state_dict()
        if self._reward_normalizer is not None:
            state_dict[
                'reward_normalizer'] = self._reward_normalizer.state_dict()
        torch.save(state_dict, filename)

    def load(self, filename):
        checkpoint = torch.load(filename)
        self._agent_online.load_state_dict(checkpoint['agent'])
        self._agent_train.load_state_dict(checkpoint['agent'])
        if 'obs_normalizer' in checkpoint and self._obs_normalizer is not None:
            self._obs_normalizer.load_state_dict(checkpoint['obs_normalizer'])
        if 'reward_normalizer' in checkpoint and self._reward_normalizer is not None:
            self._reward_normalizer.load_state_dict(
                checkpoint['reward_normalizer'])

    def _act(self, fake_agent, observation, deterministic, **kwargs):
        # used by BaseTrainer._test_agent_service() and by _act_and_step() during rollout gathering;
        # the 'fake_agent' arg is unused and only keeps the signature compatible with the base class
        observation = self._normalize_observation_fn(observation, False)
        return super()._act(self._agent_online, observation, deterministic)

    def _normalize_observation_fn(self, observation, training):
        if self._obs_normalizer is not None:
            if training:
                self._obs_normalizer.update(observation)
            observation = self._obs_normalizer.normalize(observation)
        return observation

    def _normalize_reward_fn(self, reward, training):
        # 'baselines' version: scale rewards by the std of the discounted returns
        if self._scale_reward:
            if training:
                self._reward_normalizer.update(self._env_discounted_return)
            reward = self._reward_normalizer.scale(reward)

        # 'my' version: normalize rewards by their own running mean and std
        if self._normalize_reward:
            if training:
                self._reward_normalizer.update(reward)
            reward = self._reward_normalizer.normalize(reward)

        return reward

    def _update_running_statistics(self, raw_reward):
        self._env_total_reward += raw_reward
        self._env_episode_len += 1
        self._env_discounted_return = raw_reward + self._gamma * self._env_discounted_return

    def _normalize_rollout(self, rollout):
        if self._obs_normalizer is not None:
            observations = rollout['observations']
            normalized_observations = self._obs_normalizer.normalize_vec(
                observations)
            rollout['observations'] = normalized_observations
            if self._train_obs_normalizer:
                self._obs_normalizer.update_vec(observations)

        if self._reward_normalizer is not None:
            rewards = rollout['rewards']
            if self._normalize_reward:
                normalized_rewards = self._reward_normalizer.normalize_vec(
                    rewards)
                rollout['rewards'] = normalized_rewards
                if self._train_reward_normalizer:
                    self._reward_normalizer.update_vec(rewards)
            elif self._scale_reward:
                normalized_rewards = self._reward_normalizer.scale_vec(rewards)
                rollout['rewards'] = normalized_rewards
                if self._train_reward_normalizer:
                    # TODO: this update is incorrect,
                    #  need to store last returns at each step during rollout gathering
                    self._reward_normalizer.update_vec(
                        self._env_discounted_return)

        return rollout

    def _act_and_step(self, observation):
        # calculate action, step environment, collect results into dict
        act_result, act_time = self._act(None,
                                         observation,
                                         deterministic=False)
        action = act_result['action']

        env_step_result, env_time = self._env_step(self._train_env, action)
        observation, reward, done, info = env_step_result
        # convert info from tuple of dicts to dict of np.arrays
        info = {
            key: np.stack([info_env.get(key, None) for info_env in info])
            for key in info[0].keys()
        }
        self._done_callback(done)

        self._update_running_statistics(reward)
        # reward, observation = self._call_after_env_step(raw_reward, observation)
        result = {
            'observation': observation,
            'reward': reward,
            'done': done,
            'info': info,
            **act_result
        }
        return result, act_time, env_time

    @staticmethod
    def _list_of_dicts_to_dict_of_np_array(list_of_dicts, super_key):
        # useful to convert observations and infos
        result = {
            key: np.stack([x[super_key][key] for x in list_of_dicts])
            for key in list_of_dicts[0][super_key].keys()
        }
        return result

    def _rollout_from_list_to_dict(self, rollout, first_observation):
        plural = {
            'observation': 'observations',
            'action': 'actions',
            'reward': 'rewards',
            'done': 'is_done',
        }
        # observation may be of type dict so it requires 'special' concatenation
        if type(rollout[0]['observation']) is dict:
            cat_observations = self._list_of_dicts_to_dict_of_np_array(
                [{
                    'observation': first_observation
                }, *rollout], 'observation')
        else:
            cat_observations = np.stack([
                x['observation'] for x in [{
                    'observation': first_observation
                }, *rollout]
            ])

        cat_infos = self._list_of_dicts_to_dict_of_np_array(rollout, 'info')

        keys = rollout[0].keys()
        keys = [k for k in keys if k not in ['observation', 'info']]
        rollout = {
            plural.get(k, k): np.stack([x[k] for x in rollout])
            for k in keys
        }
        rollout['observations'] = cat_observations
        rollout['infos'] = cat_infos
        return rollout

    def _gather_rollout(self, observation, rollout_len):
        # this function is called only while the agent is training
        start_time = time.time()

        first_observation = observation
        rollout = []

        mean_act_time = 0
        mean_env_time = 0

        for _ in range(rollout_len):
            act_step_result, act_time, env_time = self._act_and_step(
                observation)
            observation = act_step_result['observation']
            rollout.append(act_step_result)
            mean_act_time += act_time
            mean_env_time += env_time

        mean_act_time /= rollout_len
        mean_env_time /= rollout_len

        # now rollout is a list of dicts, convert it to dict of 'numpy.array'
        rollout = self._rollout_from_list_to_dict(rollout, first_observation)

        elapsed_time = time.time() - start_time

        rewards = rollout['rewards']
        rollout_log = {
            'reward_mean': rewards.mean(),
            'reward_std': rewards.std()
        }
        time_log = {
            'mean_act_time': mean_act_time,
            'mean_env_time': mean_env_time,
            'gather_rollout_time': elapsed_time
        }
        return observation, rollout, rollout_log, time_log

    def _small_done_callback(self, i):
        self._writer.add_scalars('agents/train_reward/',
                                 {f'agent_{i}': self._env_total_reward[i]},
                                 self._env_episode_number[i])
        self._writer.add_scalars('agents/train_ep_len/',
                                 {f'agent_{i}': self._env_episode_len[i]},
                                 self._env_episode_number[i])
        self._env_total_reward[i] = 0.0
        self._env_episode_len[i] = 0
        self._env_discounted_return[i] = 0.0
        self._env_episode_number[i] += 1

    def _done_callback(self, done):
        if np.any(done):
            for i, d in enumerate(done):
                if d:
                    self._small_done_callback(i)

    def _train_step(self, observation, rollout_len, step, train_agent=True):
        # gather rollout -> train on it -> write training logs
        observation, rollout, rollout_log, time_log = self._gather_rollout(
            observation, rollout_len)
        rollout = self._normalize_rollout(rollout)

        if train_agent:
            train_logs, time_logs = self._agent_train.train_on_rollout(rollout)
        else:
            train_logs, time_logs = dict(), dict()

        train_logs.update(rollout_log)
        time_logs.update(time_log)
        self._write_logs('train/', train_logs, step)
        self._write_logs('time/', time_logs, step)
        return observation

    def _update_online_agent(self):
        self._agent_online.load_state_dict(self._agent_train.state_dict())

    def train(self, n_epoch, n_steps_per_epoch, rollout_len,
              n_tests_per_epoch):
        """
        Run training for 'n_epoch' epochs; each epoch performs 'n_steps_per_epoch'
        training steps on rollouts of length 'rollout_len'.
        At the end of each epoch, run 'n_tests_per_epoch' tests and save a checkpoint.

        :param n_epoch: number of training epochs
        :param n_steps_per_epoch: number of training steps per epoch
        :param rollout_len: number of environment steps in each rollout
        :param n_tests_per_epoch: number of test episodes at the end of each epoch
        :return:
        """
        observation = self._train_env.reset()
        self._save_n_test(0, n_tests_per_epoch, self._agent_online)

        self._agent_train.train()  # always in training mode

        if self._warm_up_steps > 0:
            # just update the normalizers' statistics without training the agent
            p_bar = trange(self._warm_up_steps, ncols=90, desc='warm_up')
            for step in p_bar:
                observation = self._train_step(observation,
                                               rollout_len,
                                               step,
                                               train_agent=False)

        for epoch in range(n_epoch):
            self._agent_online.train()
            p_bar = trange(n_steps_per_epoch, ncols=90, desc=f'epoch_{epoch}')
            for train_step in p_bar:
                step = train_step + epoch * n_steps_per_epoch + self._warm_up_steps
                observation = self._train_step(observation, rollout_len, step)
                if (step + 1) % self._update_period == 0:
                    self._update_online_agent()

            self._update_online_agent()
            self._save_n_test(epoch + 1, n_tests_per_epoch, self._agent_online)
        self._writer.close()
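

# A toy, self-contained illustration (not from the original code) of the
# list-of-dicts to dict-of-np.arrays conversion performed by
# _rollout_from_list_to_dict() and _list_of_dicts_to_dict_of_np_array() above:
# the per-step dictionaries are stacked along a new leading time dimension.
import numpy as np

steps = [
    {'observation': np.zeros(3), 'action': np.array([0.1]), 'reward': 1.0, 'done': False},
    {'observation': np.ones(3), 'action': np.array([-0.2]), 'reward': 0.0, 'done': True},
]
stacked = {key: np.stack([step[key] for step in steps]) for key in steps[0].keys()}
# stacked['observation'].shape == (2, 3), stacked['reward'].shape == (2,)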
Example #5
class AgentInference:
    def __init__(self, nn, device, distribution, distribution_args):
        self.nn = nn
        self.device = device
        self.nn.to(device)
        self.distribution = distributions_dict[distribution](
            **distribution_args)
        self.distribution_with_params = False
        if hasattr(self.distribution, 'has_state'):
            self.distribution_with_params = True
            self.distribution.to(device)
        self.obs_normalizer = None

    def load_state_dict(self, state):
        self.nn.load_state_dict(state['nn'])
        if self.distribution_with_params:
            self.distribution.load_state_dict(state['distribution'])

    def load(self, filename, **kwargs):
        checkpoint = torch.load(filename, **kwargs)
        agent_state = checkpoint['agent']
        self.load_state_dict(agent_state)
        if 'obs_normalizer' in checkpoint:
            self.obs_normalizer = RunningMeanStd()
            self.obs_normalizer.load_state_dict(checkpoint['obs_normalizer'])

    def train(self):
        self.nn.train()

    def eval(self):
        self.nn.eval()

    def _t(self, x):
        # observation may be dict itself (in goal-augmented or multi-part observation envs)
        if type(x) is dict:
            x_t = {
                key: torch.tensor(value,
                                  dtype=torch.float32,
                                  device=self.device)
                for key, value in x.items()
            }
        else:
            x_t = torch.tensor(x, dtype=torch.float32, device=self.device)
        return x_t

    def act(self, observation, deterministic):
        """
        :param observation: np.array of observations, shape = [B, dim(obs)]
        :param deterministic: if True, the action is taken as the policy mean instead of being sampled
        :return: dict with 'policy', 'value', 'action' and 'log_prob' np.arrays for the batch
        """
        if self.obs_normalizer is not None:
            observation = self.obs_normalizer.normalize(observation)
        with torch.no_grad():
            if type(observation) is dict:
                observation = {
                    key: value[None, :]
                    for key, value in observation.items()
                }
            else:
                observation = [observation]
            nn_result = self.nn(self._t(observation))
            policy, value = nn_result['policy'], nn_result['value']
            # RealNVP requires 'no_grad' here
            action, log_prob = self.distribution.sample(policy, deterministic)

        policy = policy[0].cpu().numpy()
        value = value[0].cpu().numpy()
        action = action[0].cpu().numpy()
        log_prob = log_prob[0].cpu().numpy()

        result = {
            'policy': policy,
            'value': value,
            'action': action,
            'log_prob': log_prob
        }
        return result

    def log_prob(self, observations, actions):
        with torch.no_grad():
            nn_result = self.nn(self._t(observations))
            policy, value = nn_result['policy'], nn_result['value']
            log_prob = self.distribution.log_prob(policy, self._t(actions))
        result = {'value': value, 'log_prob': log_prob}
        return result
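

# A toy illustration (an assumption about the data layout) of the dict
# observations handled by _t() and act() above, e.g. in goal-augmented envs:
# each entry gets a leading batch dimension and is converted to a float32 tensor.
import numpy as np
import torch

observation = {'state': np.array([0.0, 1.0, 2.0]), 'goal': np.array([3.0, 4.0])}
observation = {key: value[None, :] for key, value in observation.items()}
observation_t = {
    key: torch.tensor(value, dtype=torch.float32)
    for key, value in observation.items()
}
# observation_t['state'].shape == torch.Size([1, 3])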
class OnPolicyTrainer(BaseTrainer):
    def __init__(
            self,
            agent_online,
            agent_train,
            train_env,
            # kwargs come from the config
            update_period=1,
            normalize_obs=False,
            train_obs_normalizer=False,
            scale_reward=False,
            normalize_reward=False,
            train_reward_normalizer=False,
            obs_clip=float('inf'),
            reward_clip=float('inf'),
            warm_up_steps=0,
            **kwargs):
        """On-policy trainer

        :param agent_online: agent which collects data
        :param agent_train: agent which performs train-ops
        :param train_env: environment for collecting training data
        :param test_env: environment for testing agent once per epoch
        :param update_period: number of train-ops after which
                              online agent loads weights of training agent
        :param normalize_obs: if True then observations will be normalized
                              by running mean and std of collected observations
        :param train_obs_normalizer: if True then running mean and std of obs_normalizer
                                     will be updated each environment step
        :param scale_reward: if True then reward will be scaled
                             by the running std of collected episode returns
        :param normalize_reward: if True then reward will be normalized
                                 by running mean and std of collected rewards
        :param train_reward_normalizer: if True then running mean and std of reward_normalizer
                                        will be updated each environment step
        :param obs_clip: abs(observation) will be clipped to this value after normalization
        :param reward_clip: abs(reward) will be clipped to this value after normalization
        :param warm_up_steps: number of steps not to update online agent,
                              useful to continue training from checkpoint
        :param kwargs: test_env and log_dir
        """
        super().__init__(**kwargs)

        self._agent_online = agent_online  # gather rollouts
        self._agent_train = agent_train  # do train-ops
        self._update_online_agent()
        # weights of the online agent are updated once every 'update_period' train-ops
        # and at the end of each training epoch
        self._update_period = update_period
        self._warm_up_steps = warm_up_steps

        # both environments should:
        #   be vectorized
        #   reset automatically at the end of an episode
        self._train_env = train_env

        # normalizers:
        self._obs_normalizer = RunningMeanStd() if normalize_obs else None
        self._train_obs_normalizer = train_obs_normalizer
        self._obs_clip = obs_clip
        assert not (normalize_reward and scale_reward), \
            'reward may be normalized or scaled, but not both at the same time!'
        self._reward_scaler = RunningMeanStd() if scale_reward else None
        self._reward_normalizer = RunningMeanStd(
        ) if normalize_reward else None
        self._train_reward_normalizer = train_reward_normalizer
        self._reward_clip = reward_clip

        self._gamma = self._agent_train.gamma
        # store episode reward, length, return and number for each train environment
        self._env_total_reward = np.zeros(train_env.num_envs, dtype=np.float32)
        self._env_episode_len = np.zeros(train_env.num_envs, dtype=np.int32)
        self._env_discounted_return = np.zeros(train_env.num_envs,
                                               dtype=np.float32)
        self._env_episode_number = np.zeros(train_env.num_envs, dtype=np.int32)

    def save(self, filename):
        state_dict = {'agent': self._agent_train.state_dict()}
        if self._obs_normalizer is not None:
            state_dict['obs_normalizer'] = self._obs_normalizer.state_dict()
        if self._reward_normalizer is not None:
            state_dict[
                'reward_normalizer'] = self._reward_normalizer.state_dict()
        torch.save(state_dict, filename)

    def load(self, filename):
        checkpoint = torch.load(filename)
        self._agent_online.load_state_dict(checkpoint['agent'])
        self._agent_train.load_state_dict(checkpoint['agent'])
        if 'obs_normalizer' in checkpoint and self._obs_normalizer is not None:
            self._obs_normalizer.load_state_dict(checkpoint['obs_normalizer'])
        if 'reward_normalizer' in checkpoint and self._reward_normalizer is not None:
            self._reward_normalizer.load_state_dict(
                checkpoint['reward_normalizer'])

    def _act(self,
             fake_agent,
             observation,
             deterministic,
             need_norm=True,
             **kwargs):
        # used by BaseTrainer._test_agent_service() (default need_norm=True) and by rollout gathering (need_norm=False);
        # the 'fake_agent' arg is unused and only keeps the signature compatible with the base class
        if need_norm:
            observation = self._normalize_observation(observation, False)
        return super()._act(self._agent_online, observation, deterministic)

    def _normalize_observation(self, observation, training):
        if self._obs_normalizer is not None:
            if training:
                self._obs_normalizer.update(observation)
            mean, var = self._obs_normalizer.mean, self._obs_normalizer.var
            observation = (observation - mean) / np.maximum(np.sqrt(var), 1e-6)
            observation = np.clip(observation, -self._obs_clip, self._obs_clip)
        return observation

    def _normalize_reward(self, reward, training):
        # 'baselines' version:
        if self._reward_scaler is not None:
            if training:
                self._reward_scaler.update(self._env_discounted_return)
            var = self._reward_scaler.var
            reward = reward / np.maximum(np.sqrt(var), 1e-6)
            reward = np.clip(reward, -self._reward_clip, self._reward_clip)

        # 'my' version:
        if self._reward_normalizer is not None:
            if training:
                self._reward_normalizer.update(reward)
            mean, var = self._reward_normalizer.mean, self._reward_normalizer.var
            reward = (reward - mean) / np.maximum(np.sqrt(var), 1e-6)
            reward = np.clip(reward, -self._reward_clip, self._reward_clip)

        return reward

    def _call_after_env_step(self, reward, observation):
        # update running statistics and normalize reward & observation
        self._env_total_reward += reward
        self._env_episode_len += 1
        self._env_discounted_return = reward + self._gamma * self._env_discounted_return

        observation = self._normalize_observation(
            observation, training=self._train_obs_normalizer)
        reward = self._normalize_reward(reward,
                                        training=self._train_reward_normalizer)
        return reward, observation

    @time_it
    def _gather_rollout(self, observation, rollout_len):
        # this function is called only while the agent is training
        # the initial observation (i.e. at the very beginning of training) is not normalized
        raw_rewards = []
        observations, rewards, is_done = [observation], [], []
        act_results = []  # actions, values, log-probs and policies go here

        mean_act_time = 0
        mean_env_time = 0

        for _ in range(rollout_len):
            act_result, act_time = self._act(None,
                                             observation,
                                             deterministic=False,
                                             need_norm=False)
            action = act_result['action']
            mean_act_time += act_time

            env_step_result, env_time = self._env_step(self._train_env, action)
            observation, reward, done, _ = env_step_result
            mean_env_time += env_time

            raw_rewards.append(np.copy(reward))

            reward, observation = self._call_after_env_step(
                reward, observation)

            observations.append(observation)
            rewards.append(reward)
            is_done.append(done)

            act_results.append(act_result)

            self._done_callback(done)

        mean_act_time /= rollout_len
        mean_env_time /= rollout_len

        # act_results here is a list of dicts, convert it to dict of 'numpy.array'
        act_results = {
            k: np.stack([x[k] for x in act_results])
            for k in act_results[0].keys()
        }
        act_results['actions'] = act_results.pop('action')

        rollout = {
            'observations': observations,
            'rewards': rewards,
            'is_done': is_done,
            **act_results
        }
        gather_result = rollout, raw_rewards, observation
        mean_time = mean_act_time, mean_env_time
        return gather_result, mean_time

    def _small_done_callback(self, i):
        self._writer.add_scalars('agents/train_reward/',
                                 {f'agent_{i}': self._env_total_reward[i]},
                                 self._env_episode_number[i])
        self._writer.add_scalars('agents/train_ep_len/',
                                 {f'agent_{i}': self._env_episode_len[i]},
                                 self._env_episode_number[i])
        self._env_total_reward[i] = 0.0
        self._env_episode_len[i] = 0
        self._env_discounted_return[i] = 0.0
        self._env_episode_number[i] += 1

    def _done_callback(self, done):
        if np.any(done):
            for i, d in enumerate(done):
                if d:
                    self._small_done_callback(i)

    def _train_step(self, observation, rollout_len, step):
        # gather rollout -> train on it -> write training logs
        (gather_result, mean_time), gather_time = self._gather_rollout(
            observation, rollout_len)

        rollout, raw_rewards, observation = gather_result
        mean_act_time, mean_env_time = mean_time

        train_logs, time_logs = self._agent_train.train_on_rollout(rollout)
        train_logs['reward_mean'] = np.mean(raw_rewards)
        train_logs['reward_std'] = np.std(raw_rewards)

        time_logs['mean_act_time'] = mean_act_time
        time_logs['mean_env_time'] = mean_env_time
        time_logs['gather_rollout_time'] = gather_time

        self._write_logs('train/', train_logs, step)
        self._write_logs('time/', time_logs, step)
        return observation

    def _update_online_agent(self):
        self._agent_online.load_state_dict(self._agent_train.state_dict())

    def train(self, n_epoch, n_steps_per_epoch, rollout_len,
              n_tests_per_epoch):
        """
        Run training for 'n_epoch' epochs; each epoch performs 'n_steps_per_epoch'
        training steps on rollouts of length 'rollout_len'.
        At the end of each epoch, run 'n_tests_per_epoch' tests and save a checkpoint.

        :param n_epoch: number of training epochs
        :param n_steps_per_epoch: number of training steps per epoch
        :param rollout_len: number of environment steps in each rollout
        :param n_tests_per_epoch: number of test episodes at the end of each epoch
        :return:
        """
        observation = self._train_env.reset()
        self._save_n_test(0, n_tests_per_epoch, self._agent_online)

        self._agent_train.train()  # always in training mode
        for epoch in range(n_epoch):
            self._agent_online.train()
            p_bar = trange(n_steps_per_epoch, ncols=90, desc=f'epoch_{epoch}')
            for train_step in p_bar:
                step = train_step + epoch * n_steps_per_epoch
                observation = self._train_step(observation, rollout_len, step)
                if step > self._warm_up_steps and (
                        step + 1) % self._update_period == 0:
                    self._update_online_agent()

            self._update_online_agent()
            self._save_n_test(epoch + 1, n_tests_per_epoch, self._agent_online)
        self._writer.close()
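

# A toy numeric illustration (not from the original code) of the 'baselines'
# reward scaling used in _normalize_reward() above: a discounted return
# R_t = r_t + gamma * R_{t-1} is tracked per environment, the running std of
# those returns is estimated, and raw rewards are divided by that std.
import numpy as np

gamma = 0.99
discounted_return = np.zeros(2, dtype=np.float32)  # two parallel environments
returns_history = []
for reward in [np.array([1.0, 0.0]), np.array([0.0, 1.0]), np.array([1.0, 1.0])]:
    discounted_return = reward + gamma * discounted_return
    returns_history.append(discounted_return.copy())
return_std = np.concatenate(returns_history).std()
scaled_reward = np.array([1.0, 1.0]) / max(return_std, 1e-6)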