Example #1
    def train(self, runner):
        """Obtain samplers and start actual training for each epoch.

        Args:
            runner (LocalRunner): LocalRunner is passed to give the algorithm
                access to runner.step_epochs(), which provides services
                such as snapshotting and sampler control.

        Returns:
            float: The average return in last epoch cycle.

        """
        last_return = None

        for _ in runner.step_epochs():
            for cycle in range(self.steps_per_epoch):
                runner.step_path = runner.obtain_samples(runner.step_itr)
                for path in runner.step_path:
                    path['rewards'] *= self.reward_scale
                last_return = self.train_once(runner.step_itr,
                                              runner.step_path)
                if cycle == 0 and self.evaluate:
                    log_performance(runner.step_itr,
                                    self._obtain_evaluation_samples(
                                        runner.get_env_copy()),
                                    discount=self.discount)
                    tabular.record('TotalEnvSteps', runner.total_env_steps)
                runner.step_itr += 1

        return last_return
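
The tabular.record(...) and logger.log(tabular) calls used throughout these examples come from the dowel logging package, and they only produce output once at least one output sink has been attached to the global logger. A minimal setup sketch, assuming dowel's standard StdOutput/CsvOutput sinks (the flush call mirrors the one in Example #3):

import dowel
from dowel import logger, tabular

# Attach sinks before training; with no outputs attached, the records
# have nowhere to go.
logger.add_output(dowel.StdOutput())
logger.add_output(dowel.CsvOutput('progress.csv'))

# Inside a training loop, values accumulate in `tabular` and are handed to
# every attached sink by logger.log(tabular); file-backed sinks are flushed
# explicitly.
tabular.record('TotalEnvSteps', 0)
logger.log(tabular)
logger.dump_output_type(dowel.CsvOutput)
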
Example #2
    def train_once(self, itr, paths):
        """Perform one step of policy optimization given one batch of samples.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            float: The average return in last epoch cycle.

        """
        # -- Stage: Calculate baseline
        if hasattr(self._baseline, 'predict_n'):
            baseline_predictions = self._baseline.predict_n(paths)
        else:
            baseline_predictions = [
                self._baseline.predict(path) for path in paths
            ]

        # -- Stage: Pre-process samples based on collected paths
        samples_data = paths_to_tensors(paths, self.max_path_length,
                                        baseline_predictions, self._discount)

        # -- Stage: Run and calculate performance of the algorithm
        undiscounted_returns = log_performance(
            itr,
            TrajectoryBatch.from_trajectory_list(self._env_spec, paths),
            discount=self._discount)
        self._episode_reward_mean.extend(undiscounted_returns)
        tabular.record('Extras/EpisodeRewardMean',
                       np.mean(self._episode_reward_mean))
        samples_data['average_return'] = np.mean(undiscounted_returns)

        epoch = itr // self._n_samples
        i_sample = itr - epoch * self._n_samples

        tabular.record('Epoch', epoch)
        tabular.record('# Sample', i_sample)

        rtn = samples_data['average_return']
        self._all_returns.append(samples_data['average_return'])

        if (itr + 1) % self._n_samples == 0:
            avg_rtns = np.array(self._all_returns)
            self._es.tell(self._all_params, -avg_rtns)
            self.policy.set_param_values(self._es.best.get()[0])

            # Clear for next epoch
            rtn = max(self._all_returns)
            self._all_returns.clear()
            self._all_params = self._sample_params()

        self._cur_params = self._all_params[(i_sample + 1) % self._n_samples]
        self.policy.set_param_values(self._cur_params)

        logger.log(tabular)
        return rtn
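
Example #2 drives an evolution-strategy optimizer through an ask/tell-style interface: sampled parameter sets are evaluated for one epoch each, and the negated average returns are fed back because such optimizers minimize. Assuming `self._es` is backed by something like the `cma` package (an assumption based on the tell()/best.get() calls, not confirmed by the snippet), the bare pattern looks roughly like this, with `average_return` as a hypothetical stand-in for a policy rollout:

import numpy as np
import cma  # assumed optimizer backing `self._es`; `pip install cma`

def average_return(params):
    # Hypothetical stand-in for rolling out a policy parameterized by `params`.
    return -float(np.sum((params - 1.0) ** 2))

es = cma.CMAEvolutionStrategy(np.zeros(5), 0.5)
for _ in range(20):
    candidate_params = es.ask()                       # cf. self._sample_params()
    returns = [average_return(p) for p in candidate_params]
    es.tell(candidate_params, [-r for r in returns])  # negate: ES minimizes
best_params = es.best.get()[0]                        # cf. Example #2
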
Example #3
def test_log_performance():
    lengths = np.array([10, 5, 1, 1])
    batch = TrajectoryBatch(
        EnvSpec(akro.Box(np.array([0., 0., 0.]), np.array([1., 1., 1.])),
                akro.Box(np.array([-1., -1.]), np.array([0., 0.]))),
        observations=np.ones((sum(lengths), 3), dtype=np.float32),
        last_observations=np.ones((len(lengths), 3), dtype=np.float32),
        actions=np.zeros((sum(lengths), 2), dtype=np.float32),
        rewards=np.array([
            0.34026529, 0.58263177, 0.84307509, 0.97651095, 0.81723901,
            0.22631398, 0.03421301, 0.97515046, 0.64311832, 0.65068933,
            0.17657714, 0.04783857, 0.73904013, 0.41364329, 0.52235551,
            0.24203526, 0.43328910
        ]),
        terminals=np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1],
                           dtype=bool),
        env_infos={
            'success':
            np.array([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1],
                     dtype=bool)
        },
        agent_infos={},
        lengths=lengths)

    log_file = tempfile.NamedTemporaryFile()
    csv_output = dowel.CsvOutput(log_file.name)
    logger.add_output(csv_output)
    log_performance(7, batch, 0.8, prefix='test_log_performance')
    logger.log(tabular)
    logger.dump_output_type(dowel.CsvOutput)
    with open(log_file.name, 'r') as file:
        rows = list(csv.DictReader(file))
    res = {k: float(r) for (k, r) in rows[0].items()}
    assert res['test_log_performance/Iteration'] == 7
    assert res['test_log_performance/NumTrajs'] == 4
    assert math.isclose(res['test_log_performance/SuccessRate'], 0.75)
    assert math.isclose(res['test_log_performance/CompletionRate'], 0.5)
    assert math.isclose(res['test_log_performance/AverageDiscountedReturn'],
                        1.1131040640673113)
    assert math.isclose(res['test_log_performance/AverageReturn'],
                        2.1659965525)
    assert math.isclose(res['test_log_performance/StdReturn'],
                        2.354067152038576)
Example #4
    def train_once(self, itr, paths):
        """Perform one step of policy optimization given one batch of samples.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            numpy.float64: Average return.

        """
        # -- Stage: Calculate baseline
        paths = [
            dict(
                observations=self._env_spec.observation_space.flatten_n(
                    path['observations'])
                if self._flatten_input else path['observations'],
                actions=(
                    self._env_spec.action_space.flatten_n(  # noqa: E126
                        path['actions'])),
                rewards=path['rewards'],
                env_infos=path['env_infos'],
                agent_infos=path['agent_infos'],
                dones=path['dones']) for path in paths
        ]

        if hasattr(self._baseline, 'predict_n'):
            baseline_predictions = self._baseline.predict_n(paths)
        else:
            baseline_predictions = [
                self._baseline.predict(path) for path in paths
            ]

        # -- Stage: Pre-process samples based on collected paths
        samples_data = paths_to_tensors(paths, self.max_path_length,
                                        baseline_predictions, self._discount,
                                        self._gae_lambda)

        # -- Stage: Run and calculate performance of the algorithm
        undiscounted_returns = log_performance(
            itr,
            TrajectoryBatch.from_trajectory_list(self._env_spec, paths),
            discount=self._discount)
        self._episode_reward_mean.extend(undiscounted_returns)
        tabular.record('Extras/EpisodeRewardMean',
                       np.mean(self._episode_reward_mean))

        samples_data['average_return'] = np.mean(undiscounted_returns)

        self.log_diagnostics(samples_data)
        logger.log('Optimizing policy...')
        self.optimize_policy(samples_data)
        return samples_data['average_return']
Example #5
File: sac.py  Project: seba-1511/metarl
    def train(self, runner):
        """Obtain samplers and start actual training for each epoch.

        Args:
            runner (LocalRunner): LocalRunner is passed to give the algorithm
                access to runner.step_epochs(), which provides services
                such as snapshotting and sampler control.

        Returns:
            float: The average return in last epoch cycle.

        """

        for _ in runner.step_epochs():
            if self.replay_buffer.n_transitions_stored < self.min_buffer_size:
                batch_size = self.min_buffer_size
            else:
                batch_size = None
            runner.step_path = runner.obtain_samples(
                runner.step_itr, batch_size)
            for sample in runner.step_path:
                self.replay_buffer.store(obs=sample.observation,
                                         act=sample.action,
                                         rew=sample.reward,
                                         next_obs=sample.next_observation,
                                         done=sample.terminal)
            self.episode_rewards.append(
                sum([sample.reward for sample in runner.step_path]))
            for _ in range(self.gradient_steps):
                last_return, policy_loss, qf1_loss, qf2_loss = self.train_once(
                    runner.step_itr, runner.step_path)
            log_performance(runner.step_itr,
                            self._obtain_evaluation_samples(
                                runner.get_env_copy(), num_trajs=10),
                            discount=self.discount)
            self.log_statistics(policy_loss, qf1_loss, qf2_loss)
            tabular.record('TotalEnvSteps', runner.total_env_steps)
            runner.step_itr += 1

        return last_return
Example #6
    def _evaluate_policy(self, epoch):
        """Evaluate the performance of the policy via deterministic rollouts.

        Statistics such as (average) discounted return and success rate are
        recorded.

        Args:
            epoch (int): The current training epoch.

        Returns:
            float: The average return across self._num_evaluation_trajectories
                trajectories.

        """
        eval_trajectories = self._obtain_evaluation_samples(
            self._eval_env, num_trajs=self._num_evaluation_trajectories)
        last_return = log_performance(epoch,
                                      eval_trajectories,
                                      discount=self.discount)
        return last_return
Example #7
File: vpg.py  Project: seba-1511/metarl
    def train_once(self, itr, paths):
        """Train the algorithm once.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            numpy.float64: Calculated mean value of undiscounted returns.

        """
        batch_size = (self._training_batch_size if self._training_batch_size
                      else len(paths))
        samples = self.process_samples(itr, paths)

        for _ in range(self._training_epochs):
            minibatch_ids_list = torch.randperm(len(paths)).split(batch_size)

            for minibatch_ids in minibatch_ids_list:
                obs, actions, rewards, valids, baselines = self._get_minibatch(
                    samples, minibatch_ids)

                loss = self._compute_loss(itr, obs, actions, rewards, valids,
                                          baselines)

                self._old_policy.load_state_dict(self.policy.state_dict())

                self._optimizer.zero_grad()
                loss.backward()
                self._optimize(itr, obs, actions, rewards, valids, baselines)

                self.baseline.fit(paths)

        average_returns = log_performance(itr,
                                          TrajectoryBatch.from_trajectory_list(
                                              self.env_spec, paths),
                                          discount=self.discount)

        return np.mean(average_returns)
Example #8
    def train(self, runner):
        """Obtain samplers and start actual training for each epoch.

        Args:
            runner (LocalRunner): LocalRunner is passed to give the algorithm
                access to runner.step_epochs(), which provides services
                such as snapshotting and sampler control.

        Returns:
            float: The average return in last epoch cycle.

        """
        last_return = None

        for _ in runner.step_epochs():
            for cycle in range(self.epoch_cycles):
                if self.replay_buffer.n_transitions_stored < self.min_buffer_size:
                    batch_size = self.min_buffer_size
                else:
                    batch_size = None
                runner.step_path = runner.obtain_samples(
                    runner.step_itr, batch_size)
                for sample in runner.step_path:
                    self.replay_buffer.store(obs=sample.observation,
                                             act=sample.action,
                                             rew=sample.reward,
                                             next_obs=sample.next_observation,
                                             done=sample.terminal)
                for _ in range(self.gradient_steps):
                    last_return, policy_loss, qf1_loss, qf2_loss = self.train_once(
                        runner.step_itr, runner.step_path)
                if cycle == self.epoch_cycles - 1:
                    self.episode_rewards.append(
                        sum([sample.reward for sample in runner.step_path]))

            # evaluation
            epoch_local_success_rate = []
            for task_number, name in enumerate(self.env.task_names_ordered):
                eval_env = self.eval_env_dict[name]
                _, avg_success_rate = log_performance(
                    runner.step_itr,
                    self._obtain_evaluation_samples(
                        MTEnvEvalWrapper(eval_env, task_number,
                                         self._num_tasks,
                                         self.env._max_plain_dim),
                        num_trajs=self.num_eval_paths),
                    discount=self.discount,
                    prefix=name)

                epoch_local_success_rate.append(avg_success_rate)
            self.epoch_mean_success_rate.append(
                np.mean(epoch_local_success_rate))
            self.epoch_median_success_rate.append(
                np.median(epoch_local_success_rate))

            tabular.record('local/Mean_SuccessRate',
                           self.epoch_mean_success_rate[-1])
            tabular.record('local/Median_SuccessRate',
                           self.epoch_median_success_rate[-1])
            tabular.record('local/Max_Median_SuccessRate',
                           np.max(self.epoch_median_success_rate))
            tabular.record('local/Max_Mean_SuccessRate',
                           np.max(self.epoch_mean_success_rate))

            self.log_statistics(policy_loss, qf1_loss, qf2_loss)
            tabular.record('TotalEnvSteps', runner.total_env_steps)
            runner.step_itr += 1

        return last_return
Example #9
    def train_once(self, itr, paths):
        """Train the algorithm once.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            numpy.float64: Calculated mean value of undiscounted returns.

        """
        obs, actions, rewards, returns, valids, baselines = \
            self.process_samples(paths)

        if self._maximum_entropy:
            policy_entropies = self._compute_policy_entropy(obs)
            rewards += self._policy_ent_coeff * policy_entropies

        obs_flat = torch.cat(filter_valids(obs, valids))
        actions_flat = torch.cat(filter_valids(actions, valids))
        rewards_flat = torch.cat(filter_valids(rewards, valids))
        returns_flat = torch.cat(filter_valids(returns, valids))
        advs_flat = self._compute_advantage(rewards, valids, baselines)

        with torch.no_grad():
            policy_loss_before = self._compute_loss_with_adv(
                obs_flat, actions_flat, rewards_flat, advs_flat)
            vf_loss_before = self._value_function.compute_loss(
                obs_flat, returns_flat)
            kl_before = self._compute_kl_constraint(obs)

        self._train(obs_flat, actions_flat, rewards_flat, returns_flat,
                    advs_flat)

        with torch.no_grad():
            policy_loss_after = self._compute_loss_with_adv(
                obs_flat, actions_flat, rewards_flat, advs_flat)
            vf_loss_after = self._value_function.compute_loss(
                obs_flat, returns_flat)
            kl_after = self._compute_kl_constraint(obs)
            policy_entropy = self._compute_policy_entropy(obs)

        with tabular.prefix(self.policy.name):
            tabular.record('/LossBefore', policy_loss_before.item())
            tabular.record('/LossAfter', policy_loss_after.item())
            tabular.record('/dLoss',
                           (policy_loss_before - policy_loss_after).item())
            tabular.record('/KLBefore', kl_before.item())
            tabular.record('/KL', kl_after.item())
            tabular.record('/Entropy', policy_entropy.mean().item())

        with tabular.prefix(self._value_function.name):
            tabular.record('/LossBefore', vf_loss_before.item())
            tabular.record('/LossAfter', vf_loss_after.item())
            tabular.record('/dLoss',
                           vf_loss_before.item() - vf_loss_after.item())

        self._old_policy.load_state_dict(self.policy.state_dict())

        undiscounted_returns = log_performance(
            itr,
            TrajectoryBatch.from_trajectory_list(self._env_spec, paths),
            discount=self.discount)
        return np.mean(undiscounted_returns)
Example #10
    def log_performance(self, indices, test, epoch):
        """Get average returns for specific tasks.

        Args:
            indices (list): List of task indices to evaluate on.
            test (bool): If True, evaluate on the test tasks/environments
                (self.test_env); otherwise use the training ones (self.env).
            epoch (int): Current epoch, used as the logging iteration.

        """
        discounted_returns = []
        undiscounted_returns = []
        completion = []
        success = []
        traj = []
        for idx in indices:
            eval_paths = []
            for _ in range(self._num_evals):
                paths = self.collect_paths(idx, test)
                paths[-1]['terminals'] = paths[-1]['terminals'].squeeze()
                paths[-1]['dones'] = paths[-1]['terminals']
                # HalfCheetahVel env
                if 'task' in paths[-1]['env_infos'].keys():
                    paths[-1]['env_infos']['task'] = paths[-1]['env_infos'][
                        'task']['velocity']
                eval_paths.append(paths[-1])
                discounted_returns.append(
                    discount_cumsum(paths[-1]['rewards'], self._discount))
                undiscounted_returns.append(sum(paths[-1]['rewards']))
                completion.append(float(paths[-1]['terminals'].any()))
                # calculate success rate for metaworld tasks
                if 'success' in paths[-1]['env_infos']:
                    success.append(paths[-1]['env_infos']['success'].any())

            if test:
                env = self.test_env[idx]()
                temp_traj = TrajectoryBatch.from_trajectory_list(
                    env, eval_paths)
            else:
                env = self.env[idx]()
                temp_traj = TrajectoryBatch.from_trajectory_list(
                    env, eval_paths)
            traj.append(temp_traj)

        if test:
            with tabular.prefix('Test/'):
                if self._test_task_names:
                    log_multitask_performance(
                        epoch,
                        TrajectoryBatch.concatenate(*traj),
                        self._discount,
                        task_names=self._test_task_names)
                log_performance(epoch,
                                TrajectoryBatch.concatenate(*traj),
                                self._discount,
                                prefix='Average')
        else:
            with tabular.prefix('Train/'):
                if self._train_task_names:
                    log_multitask_performance(
                        epoch,
                        TrajectoryBatch.concatenate(*traj),
                        self._discount,
                        task_names=self._train_task_names)
                log_performance(epoch,
                                TrajectoryBatch.concatenate(*traj),
                                self._discount,
                                prefix='Average')
Example #11
    def process_samples(self, itr, paths):
        """Return processed sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            dict: Processed sample data, with key
                * average_return: (float)

        """
        baselines = []
        returns = []

        max_path_length = self.max_path_length

        if hasattr(self.baseline, 'predict_n'):
            all_path_baselines = self.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            # baselines
            path['baselines'] = all_path_baselines[idx]
            baselines.append(path['baselines'])

            # returns
            path['returns'] = tensor_utils.discount_cumsum(
                path['rewards'], self.discount)
            returns.append(path['returns'])

        obs = [path['observations'] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        actions = [path['actions'] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path['rewards'] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        agent_infos = [path['agent_infos'] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path['env_infos'] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
        ])

        valids = [np.ones_like(path['returns']) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        ent = np.sum(self.policy.distribution.entropy(agent_infos) *
                     valids) / np.sum(valids)

        undiscounted_returns = log_performance(
            itr,
            TrajectoryBatch.from_trajectory_list(self.env_spec, paths),
            discount=self.discount)

        self.episode_reward_mean.extend(undiscounted_returns)

        tabular.record('Entropy', ent)
        tabular.record('Perplexity', np.exp(ent))
        tabular.record('Extras/EpisodeRewardMean',
                       np.mean(self.episode_reward_mean))

        samples_data = dict(average_return=np.mean(undiscounted_returns))

        return samples_data
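
Examples #10 and #11 both rely on a discount_cumsum(rewards, discount) helper to turn per-step rewards into per-step discounted returns, y[t] = r[t] + discount * y[t + 1]. A minimal reference sketch of that computation (the library versions are typically vectorized; this only pins down the semantics):

import numpy as np

def discount_cumsum(rewards, discount):
    """Reverse discounted cumulative sum: y[t] = r[t] + discount * y[t + 1]."""
    out = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount * running
        out[t] = running
    return out

print(discount_cumsum(np.array([1.0, 1.0, 1.0]), 0.9))  # -> [2.71, 1.9, 1.0]
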