Example #1
    def evaluate(self, algo, test_rollouts_per_task=None):
        """Evaluate the Meta-RL algorithm on the test tasks.

        Args:
            algo (garage.np.algos.MetaRLAlgorithm): The algorithm to evaluate.
            test_rollouts_per_task (int or None): Number of rollouts per task.

        """
        if test_rollouts_per_task is None:
            test_rollouts_per_task = self._n_test_rollouts
        adapted_trajectories = []
        logger.log('Sampling for adaptation and meta-testing...')
        if self._test_sampler is None:
            self._test_sampler = LocalSampler.from_worker_factory(
                WorkerFactory(seed=get_seed(),
                              max_episode_length=self._max_episode_length,
                              n_workers=1,
                              worker_class=self._worker_class,
                              worker_args=self._worker_args),
                agents=algo.get_exploration_policy(),
                envs=self._test_task_sampler.sample(1))
        for env_up in self._test_task_sampler.sample(self._n_test_tasks):
            policy = algo.get_exploration_policy()
            traj = TrajectoryBatch.concatenate(*[
                self._test_sampler.obtain_samples(self._eval_itr, 1, policy,
                                                  env_up)
                for _ in range(self._n_exploration_traj)
            ])
            adapted_policy = algo.adapt_policy(policy, traj)
            adapted_traj = self._test_sampler.obtain_samples(
                self._eval_itr,
                test_rollouts_per_task * self._max_episode_length,
                adapted_policy)
            adapted_trajectories.append(adapted_traj)
        logger.log('Finished meta-testing...')

        if self._test_task_names is not None:
            name_map = dict(enumerate(self._test_task_names))
        else:
            name_map = None

        with tabular.prefix(self._prefix + '/' if self._prefix else ''):
            log_multitask_performance(
                self._eval_itr,
                TrajectoryBatch.concatenate(*adapted_trajectories),
                getattr(algo, 'discount', 1.0),
                name_map=name_map)
        self._eval_itr += 1
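
The loop above follows a fixed adapt-then-measure pattern. A condensed sketch of that flow (sampler, itr, n_exploration_traj and n_eval_steps are hypothetical stand-ins for the evaluator's attributes):

def adapt_and_measure(algo, sampler, itr, n_exploration_traj, n_eval_steps):
    # 1. Explore the new task with the pre-adaptation policy.
    policy = algo.get_exploration_policy()
    exploration_traj = TrajectoryBatch.concatenate(*[
        sampler.obtain_samples(itr, 1, policy)
        for _ in range(n_exploration_traj)
    ])
    # 2. Adapt the policy to what was seen during exploration.
    adapted_policy = algo.adapt_policy(policy, exploration_traj)
    # 3. Measure the adapted policy; the result feeds log_multitask_performance.
    return sampler.obtain_samples(itr, n_eval_steps, adapted_policy)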
Example #2
    def to_batch(self):
        """Convert this in-progress trajectory into a TrajectoryBatch.

        Returns:
            TrajectoryBatch: This trajectory as a batch.

        Raises:
            AssertionError: If this trajectory contains no time steps.

        """
        assert len(self.rewards) > 0
        env_infos = dict(self.env_infos)
        agent_infos = dict(self.agent_infos)
        for k, v in env_infos.items():
            env_infos[k] = np.asarray(v)
        for k, v in agent_infos.items():
            agent_infos[k] = np.asarray(v)
        return TrajectoryBatch(env_spec=self.env.spec,
                               observations=np.asarray(self.observations[:-1]),
                               last_observations=np.asarray([self.last_obs]),
                               actions=np.asarray(self.actions),
                               rewards=np.asarray(self.rewards),
                               terminals=np.asarray(self.terminals),
                               env_infos=env_infos,
                               agent_infos=agent_infos,
                               lengths=np.asarray([len(self.rewards)],
                                                  dtype='l'))
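
The batch returned by to_batch() can also be assembled by hand. A minimal sketch, assuming the terminals-based TrajectoryBatch constructor used in this example and in the tests further down (import paths may differ between garage versions):

import akro
import numpy as np

from garage import TrajectoryBatch
from garage.envs import EnvSpec

# Same spec construction as in the tests below: 3-D observations, 2-D actions.
env_spec = EnvSpec(akro.Box(np.array([0., 0., 0.]), np.array([1., 1., 1.])),
                   akro.Box(np.array([-1., -1.]), np.array([0., 0.])))
length = 4
single = TrajectoryBatch(
    env_spec=env_spec,
    observations=np.zeros((length, 3), dtype=np.float32),
    last_observations=np.zeros((1, 3), dtype=np.float32),
    actions=np.zeros((length, 2), dtype=np.float32),
    rewards=np.ones(length, dtype=np.float32),
    terminals=np.array([False, False, False, True]),
    env_infos={},
    agent_infos={},
    lengths=np.asarray([length], dtype='l'))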
Example #3
    def _obtain_evaluation_samples(self,
                                   env,
                                   num_trajs=100,
                                   max_path_length=1000):
        """Sample the policy for 10 trajectories and return average values.

        Args:
            env (garage.envs.GarageEnv): The environement used to obtain
                trajectories.
            num_trajs (int): Number of trajectories.
            max_path_length (int): Number of maximum steps in one batch.

        Returns:
            TrajectoryBatch: Evaluation trajectories, representing the best
                current performance of the algorithm.

        """
        paths = []

        for _ in range(num_trajs):
            path = rollout(env,
                           self.policy,
                           max_path_length=max_path_length,
                           deterministic=True)
            paths.append(path)
        return TrajectoryBatch.from_trajectory_list(self.env_spec, paths)
Example #4
    def obtain_exact_trajectories(self,
                                  n_traj_per_worker,
                                  agent_update,
                                  env_update=None):
        """Sample an exact number of trajectories per worker.

        Args:
            n_traj_per_worker (int): Exact number of trajectories to gather for
                each worker.
            agent_update(object): Value which will be passed into the
                `agent_update_fn` before doing rollouts. If a list is passed
                in, it must have length exactly `factory.n_workers`, and will
                be spread across the workers.
            env_update(object): Value which will be passed into the
                `env_update_fn` before doing rollouts. If a list is passed in,
                it must have length exactly `factory.n_workers`, and will be
                spread across the workers.

        Returns:
            TrajectoryBatch: Batch of gathered trajectories. Always in worker
                order. In other words, first all trajectories from worker 0,
                then all trajectories from worker 1, etc.

        """
        self._update_workers(agent_update, env_update)
        batches = []
        for worker in self._workers:
            for _ in range(n_traj_per_worker):
                batch = worker.rollout()
                batches.append(batch)
        return TrajectoryBatch.concatenate(*batches)
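
concatenate joins batches by appending their per-trajectory arrays, so trajectory counts and timestep totals simply add up; a short sketch with two hypothetical batches built like the one in the earlier sketch:

combined = TrajectoryBatch.concatenate(batch_a, batch_b)
assert len(combined.lengths) == len(batch_a.lengths) + len(batch_b.lengths)
assert combined.rewards.shape[0] == (batch_a.rewards.shape[0] +
                                     batch_b.rewards.shape[0])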
Example #5
    def obtain_samples(self, itr, num_samples, agent_update, env_update=None):
        """Collect at least a given number transitions (timesteps).

        Args:
            itr(int): The current iteration number. Using this argument is
                deprecated.
            num_samples(int): Minimum number of transitions / timesteps to
                sample.
            agent_update(object): Value which will be passed into the
                `agent_update_fn` before doing rollouts. If a list is passed
                in, it must have length exactly `factory.n_workers`, and will
                be spread across the workers.
            env_update(object): Value which will be passed into the
                `env_update_fn` before doing rollouts. If a list is passed in,
                it must have length exactly `factory.n_workers`, and will be
                spread across the workers.

        Returns:
            garage.TrajectoryBatch: The batch of collected trajectories.

        """
        self._update_workers(agent_update, env_update)
        batches = []
        completed_samples = 0
        while True:
            for worker in self._workers:
                batch = worker.rollout()
                completed_samples += len(batch.actions)
                batches.append(batch)
                if completed_samples > num_samples:
                    return TrajectoryBatch.concatenate(*batches)
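
A hypothetical call site for the sampler above; here agent_update is assumed to be the policy's current parameters, and the returned batch holds at least num_samples timesteps:

batch = sampler.obtain_samples(itr=0,
                               num_samples=1000,
                               agent_update=policy.get_param_values())
assert sum(batch.lengths) >= 1000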
Example #6
    def _gather_rollout(self, rollout_number, last_observation):
        assert 0 < self._path_lengths[
            rollout_number] <= self._max_episode_length
        env_infos = self._env_infos[rollout_number]
        agent_infos = self._agent_infos[rollout_number]
        for k, v in env_infos.items():
            env_infos[k] = np.asarray(v)
        for k, v in agent_infos.items():
            agent_infos[k] = np.asarray(v)
        traj = TrajectoryBatch(
            env_spec=self._envs[rollout_number].spec,
            observations=np.asarray(self._observations[rollout_number]),
            last_observations=np.asarray([last_observation]),
            actions=np.asarray(self._actions[rollout_number]),
            rewards=np.asarray(self._rewards[rollout_number]),
            step_types=np.asarray(self._step_types[rollout_number],
                                  dtype=StepType),
            env_infos=dict(env_infos),
            agent_infos=dict(agent_infos),
            lengths=np.asarray([self._path_lengths[rollout_number]],
                               dtype='l'))
        self._completed_rollouts.append(traj)
        self._observations[rollout_number] = []
        self._actions[rollout_number] = []
        self._rewards[rollout_number] = []
        self._step_types[rollout_number] = []
        self._path_lengths[rollout_number] = 0
        self._prev_obs[rollout_number] = self._envs[rollout_number].reset()
        self._env_infos[rollout_number] = collections.defaultdict(list)
        self._agent_infos[rollout_number] = collections.defaultdict(list)
Example #7
    def _evaluate_policy(self, epoch):
        """Evaluate the performance of the policy via deterministic rollouts.

        Statistics such as (average) discounted return and success rate are
        recorded.

        Args:
            epoch (int): The current training epoch.

        Returns:
            float: The average return across self._num_evaluation_trajectories
                trajectories.

        """
        eval_trajs = []
        for _ in range(self._num_tasks):
            eval_trajs.append(
                obtain_evaluation_samples(
                    self.policy,
                    self._eval_env,
                    num_trajs=self._num_evaluation_trajectories))
        eval_trajs = TrajectoryBatch.concatenate(*eval_trajs)
        last_return = log_multitask_performance(epoch, eval_trajs,
                                                self._discount)
        return last_return
Example #8
    def collect_rollout(self):
        """Collect the current rollout, clearing the internal buffer.

        Returns:
            garage.TrajectoryBatch: A batch of the trajectories completed since
                the last call to collect_rollout().

        """
        observations = self._observations
        self._observations = []
        last_observations = self._last_observations
        self._last_observations = []
        actions = self._actions
        self._actions = []
        rewards = self._rewards
        self._rewards = []
        terminals = self._terminals
        self._terminals = []
        env_infos = self._env_infos
        self._env_infos = defaultdict(list)
        agent_infos = self._agent_infos
        self._agent_infos = defaultdict(list)
        for k, v in agent_infos.items():
            agent_infos[k] = np.asarray(v)
        for k, v in env_infos.items():
            env_infos[k] = np.asarray(v)
        lengths = self._lengths
        self._lengths = []
        return TrajectoryBatch(self.env.spec, np.asarray(observations),
                               np.asarray(last_observations),
                               np.asarray(actions), np.asarray(rewards),
                               np.asarray(terminals), dict(env_infos),
                               dict(agent_infos), np.asarray(lengths,
                                                             dtype='i'))
Example #9
def test_agent_infos_batch_mismatch_traj(traj_data):
    with pytest.raises(
            ValueError,
            match='entry in agent_infos must have a batch dimension'):
        traj_data['agent_infos']['hidden'] = traj_data['agent_infos'][
            'hidden'][:-1]
        t = TrajectoryBatch(**traj_data)
        del t
Example #10
def test_act_box_env_spec_mismatch_traj(traj_data):
    with pytest.raises(ValueError, match='actions should have'):
        traj_data['env_spec'].action_space = akro.Box(low=1,
                                                      high=np.inf,
                                                      shape=(4, 3, 2),
                                                      dtype=np.float32)
        t = TrajectoryBatch(**traj_data)
        del t
Example #11
def test_to_trajectory_list(traj_data):
    t = TrajectoryBatch(**traj_data)
    t_list = t.to_trajectory_list()
    assert len(t_list) == len(traj_data['lengths'])
    start = 0
    for length, last_obs, s in zip(traj_data['lengths'],
                                   traj_data['last_observations'], t_list):
        stop = start + length
        assert (
            s['observations'] == traj_data['observations'][start:stop]).all()
        assert (s['next_observations'] == np.concatenate(
            (traj_data['observations'][start + 1:stop], [last_obs]))).all()
        assert (s['actions'] == traj_data['actions'][start:stop]).all()
        assert (s['rewards'] == traj_data['rewards'][start:stop]).all()
        assert (s['step_types'] == traj_data['step_types'][start:stop]).all()
        start = stop
    assert start == len(traj_data['rewards'])
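
to_trajectory_list() and from_trajectory_list() are intended to round-trip through the same per-trajectory dictionaries; a sketch under that assumption, reusing single and env_spec from the sketch after Example #2:

traj_dicts = single.to_trajectory_list()
rebuilt = TrajectoryBatch.from_trajectory_list(env_spec, traj_dicts)
assert (rebuilt.rewards == single.rewards).all()
assert (rebuilt.lengths == single.lengths).all()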
Example #12
def test_time_step_batch_from_trajectory_batch(traj_data):
    traj = TrajectoryBatch(**traj_data)
    timestep_batch = TimeStepBatch.from_trajectory_batch(traj)
    assert (timestep_batch.observations == traj.observations).all()
    assert (timestep_batch.next_observations[:traj.lengths[0] - 1] ==
            traj.observations[1:traj.lengths[0]]).all()
    assert (timestep_batch.next_observations[traj.lengths[0]] ==
            traj.last_observations[0]).all()
Example #13
File: cma_es.py, Project: songanz/garage
    def train_once(self, itr, paths):
        """Perform one step of policy optimization given one batch of samples.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            float: The average return in the last epoch cycle.

        """
        # -- Stage: Calculate baseline
        if hasattr(self._baseline, 'predict_n'):
            baseline_predictions = self._baseline.predict_n(paths)
        else:
            baseline_predictions = [
                self._baseline.predict(path) for path in paths
            ]

        # -- Stage: Pre-process samples based on collected paths
        samples_data = paths_to_tensors(paths, self.max_episode_length,
                                        baseline_predictions, self._discount)

        # -- Stage: Run and calculate performance of the algorithm
        undiscounted_returns = log_performance(
            itr,
            TrajectoryBatch.from_trajectory_list(self._env_spec, paths),
            discount=self._discount)
        self._episode_reward_mean.extend(undiscounted_returns)
        tabular.record('Extras/EpisodeRewardMean',
                       np.mean(self._episode_reward_mean))
        samples_data['average_return'] = np.mean(undiscounted_returns)

        epoch = itr // self._n_samples
        i_sample = itr - epoch * self._n_samples

        tabular.record('Epoch', epoch)
        tabular.record('# Sample', i_sample)

        rtn = samples_data['average_return']
        self._all_returns.append(samples_data['average_return'])

        if (itr + 1) % self._n_samples == 0:
            avg_rtns = np.array(self._all_returns)
            self._es.tell(self._all_params, -avg_rtns)
            self.policy.set_param_values(self._es.best.get()[0])

            # Clear for next epoch
            rtn = max(self._all_returns)
            self._all_returns.clear()
            self._all_params = self._sample_params()

        self._cur_params = self._all_params[(i_sample + 1) % self._n_samples]
        self.policy.set_param_values(self._cur_params)

        logger.log(tabular)
        return rtn
Example #14
File: te.py, Project: songanz/garage
    def collect_rollout(self):
        """Collect the current rollout, clearing the internal buffer.

        One-hot task id is saved in env_infos['task_onehot']. Latent is saved
        in agent_infos['latent']. Latent infos are saved in
        agent_infos['latent_info_name'], where info_name is the original latent
        info name.

        Returns:
            garage.TrajectoryBatch: A batch of the trajectories completed since
                the last call to collect_rollout().

        """
        observations = self._observations
        self._observations = []
        last_observations = self._last_observations
        self._last_observations = []
        actions = self._actions
        self._actions = []
        rewards = self._rewards
        self._rewards = []
        step_types = self._step_types
        self._step_types = []
        latents = self._latents
        self._latents = []
        tasks = self._tasks
        self._tasks = []
        env_infos = self._env_infos
        self._env_infos = defaultdict(list)
        agent_infos = self._agent_infos
        self._agent_infos = defaultdict(list)
        latent_infos = self._latent_infos
        self._latent_infos = defaultdict(list)
        for k, v in latent_infos.items():
            latent_infos[k] = np.asarray(v)
        for k, v in agent_infos.items():
            agent_infos[k] = np.asarray(v)
        for k, v in env_infos.items():
            env_infos[k] = np.asarray(v)
        env_infos['task_onehot'] = np.asarray(tasks)
        agent_infos['latent'] = np.asarray(latents)
        for k, v in latent_infos.items():
            agent_infos['latent_{}'.format(k)] = v
        lengths = self._lengths
        self._lengths = []

        return TrajectoryBatch(env_spec=self.env.spec,
                               observations=np.asarray(observations),
                               last_observations=np.asarray(last_observations),
                               actions=np.asarray(actions),
                               rewards=np.asarray(rewards),
                               step_types=np.asarray(step_types,
                                                     dtype=StepType),
                               env_infos=dict(env_infos),
                               agent_infos=dict(agent_infos),
                               lengths=np.asarray(lengths, dtype='i'))
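
As the docstring above describes, the task one-hot and latent values end up in the standard info dictionaries; a short hypothetical check against a worker that uses this collector:

batch = worker.collect_rollout()
assert 'task_onehot' in batch.env_infos
assert 'latent' in batch.agent_infos
# Latent distribution infos appear under a 'latent_' prefix, e.g.
# agent_infos['latent_mean'] if the encoder reports a 'mean' info.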
Example #15
def test_new(traj_data):
    t = TrajectoryBatch(**traj_data)
    assert t.env_spec is traj_data['env_spec']
    assert t.observations is traj_data['observations']
    assert t.actions is traj_data['actions']
    assert t.rewards is traj_data['rewards']
    assert t.terminals is traj_data['terminals']
    assert t.env_infos is traj_data['env_infos']
    assert t.agent_infos is traj_data['agent_infos']
    assert t.lengths is traj_data['lengths']
Example #16
def test_new_traj(traj_data):
    t = TrajectoryBatch(**traj_data)
    assert t.env_spec is traj_data['env_spec']
    assert t.observations is traj_data['observations']
    assert t.last_observations is traj_data['last_observations']
    assert t.actions is traj_data['actions']
    assert t.rewards is traj_data['rewards']
    assert t.env_infos is traj_data['env_infos']
    assert t.agent_infos is traj_data['agent_infos']
    assert t.step_types is traj_data['step_types']
    assert t.lengths is traj_data['lengths']
Example #17
def test_log_multitask_performance_task_id():
    lengths = np.array([10, 5, 1, 1])
    batch = TrajectoryBatch(
        EnvSpec(akro.Box(np.array([0., 0., 0.]), np.array([1., 1., 1.])),
                akro.Box(np.array([-1., -1.]), np.array([0., 0.]))),
        observations=np.ones((sum(lengths), 3), dtype=np.float32),
        last_observations=np.ones((len(lengths), 3), dtype=np.float32),
        actions=np.zeros((sum(lengths), 2), dtype=np.float32),
        rewards=np.array([
            0.34026529, 0.58263177, 0.84307509, 0.97651095, 0.81723901,
            0.22631398, 0.03421301, 0.97515046, 0.64311832, 0.65068933,
            0.17657714, 0.04783857, 0.73904013, 0.41364329, 0.52235551,
            0.24203526, 0.43328910
        ]),
        terminals=np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1],
                           dtype=bool),
        env_infos={
            'success':
            np.array([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1],
                     dtype=bool),
            'task_id':
            np.array([1] * 10 + [3] * 5 + [1] + [4])
        },
        agent_infos={},
        lengths=lengths)

    log_file = tempfile.NamedTemporaryFile()
    csv_output = dowel.CsvOutput(log_file.name)
    logger.add_output(csv_output)
    log_multitask_performance(7, batch, 0.8, {
        1: 'env1',
        3: 'env2',
        4: 'env3',
        5: 'env4'
    })
    logger.log(tabular)
    logger.dump_output_type(dowel.CsvOutput)
    with open(log_file.name, 'r') as file:
        rows = list(csv.DictReader(file))
    res = {k: float(r) for (k, r) in rows[0].items()}
    assert res['env1/Iteration'] == 7
    assert res['env2/Iteration'] == 7
    assert res['env3/Iteration'] == 7
    assert res['env4/Iteration'] == 7
    assert res['env1/NumTrajs'] == 2
    assert res['env2/NumTrajs'] == 1
    assert res['env3/NumTrajs'] == 1
    assert res['env4/NumTrajs'] == 0
    assert math.isclose(res['env1/SuccessRate'], 0.5)
    assert math.isclose(res['env2/SuccessRate'], 1.0)
    assert math.isclose(res['env3/SuccessRate'], 1.0)
    assert math.isnan(res['env4/SuccessRate'])
    assert math.isnan(res['env4/AverageReturn'])
Example #18
File: npo.py, Project: seraliilhan/garage
    def train_once(self, itr, paths):
        """Perform one step of policy optimization given one batch of samples.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            numpy.float64: Average return.

        """
        # -- Stage: Calculate baseline
        paths = [
            dict(
                observations=self._env_spec.observation_space.flatten_n(
                    path['observations'])
                if self._flatten_input else path['observations'],
                actions=(
                    self._env_spec.action_space.flatten_n(  # noqa: E126
                        path['actions'])),
                rewards=path['rewards'],
                env_infos=path['env_infos'],
                agent_infos=path['agent_infos'],
                dones=path['dones']) for path in paths
        ]

        if hasattr(self._baseline, 'predict_n'):
            baseline_predictions = self._baseline.predict_n(paths)
        else:
            baseline_predictions = [
                self._baseline.predict(path) for path in paths
            ]

        # -- Stage: Pre-process samples based on collected paths
        samples_data = paths_to_tensors(paths, self.max_path_length,
                                        baseline_predictions, self._discount,
                                        self._gae_lambda)

        # -- Stage: Run and calculate performance of the algorithm
        undiscounted_returns = log_performance(
            itr,
            TrajectoryBatch.from_trajectory_list(self._env_spec, paths),
            discount=self._discount)
        self._episode_reward_mean.extend(undiscounted_returns)
        tabular.record('Extras/EpisodeRewardMean',
                       np.mean(self._episode_reward_mean))

        samples_data['average_return'] = np.mean(undiscounted_returns)

        self.log_diagnostics(samples_data)
        logger.log('Optimizing policy...')
        self.optimize_policy(samples_data)
        return samples_data['average_return']
Example #19
    def train(self, runner):
        """Obtain samplers and start actual training for each epoch.

        Args:
            runner (LocalRunner): LocalRunner.

        """
        for epoch in runner.step_epochs():
            samples = runner.obtain_samples(epoch)
            log_performance(
                epoch,
                TrajectoryBatch.from_trajectory_list(self.env_spec, samples),
                self._discount)
            self._train_once(samples)
Example #20
    def collect_rollout(self):
        """Collect all completed rollouts.

        Returns:
            garage.TrajectoryBatch: A batch of the trajectories completed since
                the last call to collect_rollout().

        """
        if len(self._completed_rollouts) == 1:
            result = self._completed_rollouts[0]
        else:
            result = TrajectoryBatch.concatenate(*self._completed_rollouts)
        self._completed_rollouts = []
        return result
Example #21
    def train(self, runner):
        """Get samples and train the policy.

        Args:
            runner (LocalRunner): LocalRunner.

        """
        for epoch in runner.step_epochs():
            samples = runner.obtain_samples(epoch)
            log_performance(
                epoch,
                TrajectoryBatch.from_trajectory_list(self.env_spec, samples),
                self._discount)
            self._train_once(epoch, samples)
Example #22
    def evaluate(self, algo):
        """Evaluate the Meta-RL algorithm on the test tasks.

        Args:
            algo (garage.np.algos.MetaRLAlgorithm): The algorithm to evaluate.

        """
        adapted_trajectories = []
        for env_up in self._test_task_sampler.sample(self._n_test_tasks):
            policy = algo.get_exploration_policy()
            traj = TrajectoryBatch.concatenate(*[
                self._test_sampler.obtain_samples(self._eval_itr, 1, policy,
                                                  env_up)
                for _ in range(self._n_exploration_traj)
            ])
            adapted_policy = algo.adapt_policy(policy, traj)
            adapted_traj = self._test_sampler.obtain_samples(
                self._eval_itr, 1, adapted_policy)
            adapted_trajectories.append(adapted_traj)
        log_performance(self._eval_itr,
                        TrajectoryBatch.concatenate(*adapted_trajectories),
                        getattr(algo, 'discount', 1.0),
                        prefix=self._prefix)
        self._eval_itr += 1
Example #23
def test_log_performance():
    lengths = np.array([10, 5, 1, 1])
    batch = TrajectoryBatch(
        EnvSpec(akro.Box(np.array([0., 0., 0.]), np.array([1., 1., 1.])),
                akro.Box(np.array([-1., -1.]), np.array([0., 0.]))),
        observations=np.ones((sum(lengths), 3), dtype=np.float32),
        last_observations=np.ones((len(lengths), 3), dtype=np.float32),
        actions=np.zeros((sum(lengths), 2), dtype=np.float32),
        rewards=np.array([
            0.34026529, 0.58263177, 0.84307509, 0.97651095, 0.81723901,
            0.22631398, 0.03421301, 0.97515046, 0.64311832, 0.65068933,
            0.17657714, 0.04783857, 0.73904013, 0.41364329, 0.52235551,
            0.24203526, 0.43328910
        ]),
        step_types=np.array(
            [StepType.FIRST] + [StepType.MID] * (lengths[0] - 2) +
            [StepType.TERMINAL] + [StepType.FIRST] + [StepType.MID] *
            (lengths[1] - 2) + [StepType.TERMINAL] + [StepType.FIRST] +
            [StepType.FIRST],
            dtype=StepType),
        env_infos={
            'success':
            np.array([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1],
                     dtype=bool)
        },
        agent_infos={},
        lengths=lengths)

    log_file = tempfile.NamedTemporaryFile()
    csv_output = dowel.CsvOutput(log_file.name)
    logger.add_output(csv_output)
    log_performance(7, batch, 0.8, prefix='test_log_performance')
    logger.log(tabular)
    logger.dump_output_type(dowel.CsvOutput)
    with open(log_file.name, 'r') as file:
        rows = list(csv.DictReader(file))
    res = {k: float(r) for (k, r) in rows[0].items()}
    assert res['test_log_performance/Iteration'] == 7
    assert res['test_log_performance/NumTrajs'] == 4
    assert math.isclose(res['test_log_performance/SuccessRate'], 0.75)
    assert math.isclose(res['test_log_performance/TerminationRate'], 0.5)
    assert math.isclose(res['test_log_performance/AverageDiscountedReturn'],
                        1.1131040640673113)
    assert math.isclose(res['test_log_performance/AverageReturn'],
                        2.1659965525)
    assert math.isclose(res['test_log_performance/StdReturn'],
                        2.354067152038576)
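
The AverageDiscountedReturn and AverageReturn values asserted above follow from per-trajectory sums over the flat reward array; a sketch of that arithmetic (assuming log_performance discounts from the first step of each trajectory, and taking rewards and lengths to be the arrays used in the test):

def discounted_return(rewards, discount):
    # Per-trajectory discounted sum: sum_t discount**t * r_t.
    return float(np.sum(np.power(discount, np.arange(len(rewards))) * rewards))

starts = np.concatenate(([0], np.cumsum(lengths)[:-1]))
per_traj = [discounted_return(rewards[start:start + length], 0.8)
            for start, length in zip(starts, lengths)]
# np.mean(per_traj) should reproduce the AverageDiscountedReturn value above.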
Example #24
    def train_once(self, itr, paths):
        """Train the algorithm once.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths

        Returns:
            dict: Processed sample data, with key
                * average_return: (float)

        """
        obs, actions, rewards, valids, baselines = self.process_samples(
            itr, paths)

        loss = self._compute_loss(itr, obs, actions, rewards, valids,
                                  baselines)

        self._old_policy.load_state_dict(self.policy.state_dict())

        self._optimizer.zero_grad()
        loss.backward()

        kl_before = self._compute_kl_constraint(obs).detach()
        self._optimize(itr, obs, actions, rewards, valids, baselines)

        with torch.no_grad():
            loss_after = self._compute_loss(itr, obs, actions, rewards, valids,
                                            baselines)
            kl = self._compute_kl_constraint(obs)
            policy_entropy = self._compute_policy_entropy(obs)

        average_returns = log_performance(itr,
                                          TrajectoryBatch.from_trajectory_list(
                                              self.env_spec, paths),
                                          discount=self.discount)

        with tabular.prefix(self.policy.name):
            tabular.record('LossBefore', loss.item())
            tabular.record('LossAfter', loss_after.item())
            tabular.record('dLoss', loss.item() - loss_after.item())
            tabular.record('KLBefore', kl_before.item())
            tabular.record('KL', kl.item())
            tabular.record('Entropy', policy_entropy.mean().item())

        self.baseline.fit(paths)
        return np.mean(average_returns)
Example #25
    def log_performance(self, itr, all_samples, loss_before, loss_after,
                        kl_before, kl, policy_entropy):
        """Evaluate performance of this batch.

        Args:
            itr (int): Iteration number.
            all_samples (list[list[MAMLTrajectoryBatch]]): Two
                dimensional list of MAMLTrajectoryBatch of size
                [meta_batch_size * (num_grad_updates + 1)]
            loss_before (float): Loss before optimization step.
            loss_after (float): Loss after optimization step.
            kl_before (float): KL divergence before optimization step.
            kl (float): KL divergence after optimization step.
            policy_entropy (float): Policy entropy.

        Returns:
            float: The average return in the last epoch cycle.

        """
        tabular.record('Iteration', itr)

        name_map = None
        if hasattr(self._env, 'all_task_names'):
            names = self._env.all_task_names
            name_map = dict(zip(names, names))

        rtns = log_multitask_performance(
            itr,
            TrajectoryBatch.from_trajectory_list(
                env_spec=self._env.spec,
                paths=[
                    path for task_paths in all_samples
                    for path in task_paths[self._num_grad_updates].paths
                ]),
            discount=self._inner_algo.discount,
            name_map=name_map)

        with tabular.prefix(self._policy.name + '/'):
            tabular.record('LossBefore', loss_before)
            tabular.record('LossAfter', loss_after)
            tabular.record('dLoss', loss_before - loss_after)
            tabular.record('KLBefore', kl_before)
            tabular.record('KLAfter', kl)
            tabular.record('Entropy', policy_entropy)

        return np.mean(rtns)
Example #26
    def collect_rollout(self):
        """Gather fragments from all in-progress rollouts.

        Returns:
            garage.TrajectoryBatch: A batch of the trajectory fragments.

        """
        for i, frag in enumerate(self._fragments):
            assert frag.env is self._envs[i]
            if len(frag.rewards) > 0:
                complete_frag = frag.to_batch()
                self._complete_fragments.append(complete_frag)
                self._fragments[i] = InProgressTrajectory(
                    frag.env, frag.last_obs)
        assert len(self._complete_fragments) > 0
        result = TrajectoryBatch.concatenate(*self._complete_fragments)
        self._complete_fragments = []
        return result
Example #27
    def _gather_rollout(self, rollout_number, last_observation):
        assert 0 < self._path_lengths[rollout_number] <= self._max_path_length
        traj = TrajectoryBatch(
            self._envs[rollout_number].spec,
            np.asarray(self._observations[rollout_number]),
            np.asarray([last_observation]),
            np.asarray(self._actions[rollout_number]),
            np.asarray(self._rewards[rollout_number]),
            np.asarray(self._terminals[rollout_number]),
            self._env_infos[rollout_number], self._agent_infos[rollout_number],
            np.asarray([self._path_lengths[rollout_number]], dtype='l'))
        self._completed_rollouts.append(traj)
        self._observations[rollout_number] = []
        self._actions[rollout_number] = []
        self._rewards[rollout_number] = []
        self._terminals[rollout_number] = []
        self._path_lengths[rollout_number] = 0
        self._prev_obs[rollout_number] = self._envs[rollout_number].reset()
Example #28
    def _gather_rollout(self, rollout_number, last_observation):
        assert 0 < self._path_lengths[
            rollout_number] <= self._max_episode_length
        traj = TrajectoryBatch(
            env_spec=self._envs[rollout_number].spec,
            observations=np.asarray(self._observations[rollout_number]),
            last_observations=np.asarray([last_observation]),
            actions=np.asarray(self._actions[rollout_number]),
            rewards=np.asarray(self._rewards[rollout_number]),
            step_types=np.asarray(self._step_types[rollout_number],
                                  dtype=StepType),
            env_infos=self._env_infos[rollout_number],
            agent_infos=self._agent_infos[rollout_number],
            lengths=np.asarray([self._path_lengths[rollout_number]],
                               dtype='l'))
        self._completed_rollouts.append(traj)
        self._observations[rollout_number] = []
        self._actions[rollout_number] = []
        self._rewards[rollout_number] = []
        self._step_types[rollout_number] = []
        self._path_lengths[rollout_number] = 0
        self._prev_obs[rollout_number] = self._envs[rollout_number].reset()
Example #29
def slice_trajectories(trajectories, slice_size):
    sliced = []
    for traj in trajectories.split():
        splits = math.ceil(traj.lengths[0] / slice_size)
        split_indices = np.array_split(np.arange(traj.lengths[0]), splits)
        next_obs = traj.next_observations
        for indices in split_indices:
            last_obs = np.asarray([next_obs[indices[-1]]])
            t = TrajectoryBatch(
                env_spec=traj.env_spec,
                observations=traj.observations[indices],
                last_observations=last_obs,
                actions=traj.actions[indices],
                rewards=traj.rewards[indices],
                step_types=traj.step_types[indices],
                env_infos={k: v[indices]
                           for (k, v) in traj.env_infos.items()},
                agent_infos={k: v[indices]
                             for (k, v) in traj.agent_infos.items()},
                lengths=np.asarray([len(indices)], dtype='l'))
            sliced.append(t)
    return sliced
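
A hypothetical use of slice_trajectories: cutting a TrajectoryBatch named batch into fragments of at most 50 timesteps without losing any of them:

pieces = slice_trajectories(batch, slice_size=50)
assert sum(int(p.lengths[0]) for p in pieces) == sum(batch.lengths)
# Each piece is a single-trajectory TrajectoryBatch, so the pieces can be
# recombined with TrajectoryBatch.concatenate(*pieces) if needed.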
Example #30
    def train_once(self, itr, paths):
        """Perform one step of policy optimization given one batch of samples.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            numpy.float64: Average return.

        """
        undiscounted_returns = log_performance(
            itr,
            TrajectoryBatch.from_trajectory_list(self._env_spec, paths),
            discount=self._discount)

        samples_data = self.paths_to_tensors(paths)

        samples_data['average_return'] = np.mean(undiscounted_returns)

        logger.log('Optimizing policy...')
        self.optimize_policy(itr, samples_data)
        return samples_data['average_return']