Example #1
    def obtain_samples(self, itr, num_samples):
        """Sample the policy for new trajectories.

        Args:
            - itr(int): iteration number
            - num_samples(int): number of steps the sampler should collect
        """
        self._active_workers = []
        self._active_worker_ids = []
        pbar = ProgBarCounter(num_samples)
        completed_samples = 0
        traj = []
        updating_workers = []

        # update the policy params of each worker before sampling
        # for the current iteration
        self._idle_worker_ids = list(range(self._num_workers))
        curr_policy_params = self._algo.policy.get_param_values()
        params_id = ray.put(curr_policy_params)
        while self._idle_worker_ids:
            worker_id = self._idle_worker_ids.pop()
            worker = self._all_workers[worker_id]
            updating_workers.append(worker.set_agent.remote(params_id))

        while completed_samples < num_samples:
            # if there are workers still being updated, check
            # which ones are still updating and take the workers that
            # are done updating, and start collecting trajectories on
            # those workers.
            if updating_workers:
                updated, updating_workers = ray.wait(updating_workers,
                                                     num_returns=1,
                                                     timeout=0.1)
                upd = [ray.get(up) for up in updated]
                self._idle_worker_ids.extend(upd)

            # if there are idle workers, use them to collect trajectories
            # mark the newly busy workers as active
            while self._idle_worker_ids:
                idle_worker_id = self._idle_worker_ids.pop()
                self._active_worker_ids.append(idle_worker_id)
                worker = self._all_workers[idle_worker_id]
                self._active_workers.append(worker.rollout.remote())

            # check which workers are done/not done collecting a sample
            # if any are done, send them to process the collected trajectory
            # if they are not, keep checking if they are done
            ready, not_ready = ray.wait(self._active_workers,
                                        num_returns=1,
                                        timeout=0.001)
            self._active_workers = not_ready
            for result in ready:
                trajectory, num_returned_samples = self._process_trajectory(
                    result)
                completed_samples += num_returned_samples
                pbar.inc(num_returned_samples)
                traj.append(trajectory)
        pbar.stop()
        return traj
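The loop above delegates to a _process_trajectory helper that is not shown in this snippet. Based on the inlined logic in the later Ray-based examples (the rollout future resolves to a (worker_id, trajectory) pair, the worker goes back to the idle pool, and the sample count is reported), a minimal sketch might look like the following; the method body and the path-dict trajectory format are assumptions, not the actual implementation:

import ray

def _process_trajectory(self, result):
    # Hypothetical sketch: resolve the finished rollout, move its worker
    # back to the idle pool, and count the samples it contributed.
    ready_worker_id, trajectory = ray.get(result)
    self._active_worker_ids.remove(ready_worker_id)
    self._idle_worker_ids.append(ready_worker_id)
    num_returned_samples = len(trajectory['rewards'])  # assumed path dict
    return trajectory, num_returned_samples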
Example #2
File: smac_wrapper.py  Project: sisl/DICG
    def eval(self, policy, n_episodes=20, greedy=True, load_from_file=False,
             save_replay=False):
        if load_from_file:
            logger.add_output(dowel.StdOutput())
        logger.log('Evaluating policy, {} episodes, greedy = {} ...'.format(
            n_episodes, greedy))

        n_won = 0
        episode_rewards = []
        pbar = ProgBarCounter(n_episodes)
        for e in range(n_episodes):
            obs = self.reset()
            policy.reset([True])
            info = {'battle_won': False}
            terminated = False
            episode_rewards.append(0)

            while not terminated:
                obs = np.array([obs])  # add an n_envs dimension for the vec_env interface
                avail_actions = np.array([self.get_avail_actions()])
                actions, agent_infos = policy.get_actions(
                    obs, avail_actions, greedy=greedy)
                obs, reward, terminated, info = self.step(actions[0])
                if not self.centralized:
                    terminated = all(terminated)
                episode_rewards[-1] += np.mean(reward)
            pbar.inc(1)
            if save_replay:
                self.save_replay()

            # In case SC2 restarts during eval, 'battle_won' may be missing
            # from info; guard against the resulting KeyError.
            if isinstance(info, dict) and info.get('battle_won', False):
                n_won += 1

        pbar.stop()
        policy.reset([True])
        win_rate = n_won / n_episodes
        avg_return = np.mean(episode_rewards)

        logger.log('EvalWinRate: {}'.format(win_rate))
        logger.log('EvalAvgReturn: {}'.format(avg_return))
        if not load_from_file:
            tabular.record('EvalWinRate', win_rate)
            tabular.record('EvalAvgReturn', avg_return)
Example #3
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Obtain samples."""
        logger.log('Obtaining samples for iteration %d...' % itr)

        if not batch_size:
            batch_size = self.algo.max_path_length * self.n_envs

        paths = []
        n_samples = 0
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy

        import time
        while n_samples < batch_size:
            t = time.time()
            policy.reset(dones)

            actions, agent_infos = policy.get_actions(obses)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t
            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(  # noqa: E501
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]['observations'].append(observation)
                running_paths[idx]['actions'].append(action)
                running_paths[idx]['rewards'].append(reward)
                running_paths[idx]['env_infos'].append(env_info)
                running_paths[idx]['agent_infos'].append(agent_info)
                if done:
                    paths.append(
                        dict(observations=self.env_spec.observation_space.
                             flatten_n(running_paths[idx]['observations']),
                             actions=self.env_spec.action_space.flatten_n(
                                 running_paths[idx]['actions']),
                             rewards=tensor_utils.stack_tensor_list(
                                 running_paths[idx]['rewards']),
                             env_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]['env_infos']),
                             agent_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]['agent_infos'])))
                    n_samples += len(running_paths[idx]['rewards'])
                    running_paths[idx] = None

            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        tabular.record('PolicyExecTime', policy_time)
        tabular.record('EnvExecTime', env_time)
        tabular.record('ProcessExecTime', process_time)

        if whole_paths:
            return paths
        else:
            paths_truncated = truncate_paths(paths, batch_size)
            return paths_truncated
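The returned paths are plain dictionaries, so downstream metrics reduce to numpy operations on their arrays. A self-contained illustration with stand-in paths (the real ones come from obtain_samples above):

import numpy as np

# Illustrative stand-ins for two sampled paths.
paths = [
    {'rewards': np.array([1.0, 0.0, 2.0])},
    {'rewards': np.array([0.5, 0.5])},
]
# Average undiscounted return across paths.
average_return = np.mean([np.sum(path['rewards']) for path in paths])
print('AverageReturn:', average_return)  # 2.0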
Example #4
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Sample the policy for new trajectories.

        Args:
            itr (int): Iteration number.
            batch_size (int): Number of samples to be collected. If None,
                it defaults to algo.max_path_length * n_envs.
            whole_paths (bool): Whether to return all the paths as-is. True
                by default. The total number of collected samples can exceed
                batch_size; if this flag is False, the paths are truncated
                so that the total equals batch_size.

        Returns:
            list[dict]: Sample paths, each path with key
                * observations: (numpy.ndarray)
                * actions: (numpy.ndarray)
                * rewards: (numpy.ndarray)
                * agent_infos: (dict)
                * env_infos: (dict)

        """
        logger.log('Obtaining samples for iteration %d...' % itr)

        if not batch_size:
            batch_size = self.algo.max_path_length * self._n_envs

        paths = []
        n_samples = 0
        obses = self._vec_env.reset()
        dones = np.asarray([True] * self._vec_env.num_envs)
        running_paths = [None] * self._vec_env.num_envs

        pbar = ProgBarCounter(batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy

        while n_samples < batch_size:
            t = time.time()
            policy.reset(dones)

            actions, agent_infos = policy.get_actions(obses)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self._vec_env.step(actions)
            env_time += time.time() - t
            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self._vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self._vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(  # noqa: E501
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]['observations'].append(observation)
                running_paths[idx]['actions'].append(action)
                running_paths[idx]['rewards'].append(reward)
                running_paths[idx]['env_infos'].append(env_info)
                running_paths[idx]['agent_infos'].append(agent_info)
                if done:
                    obs = np.asarray(running_paths[idx]['observations'])
                    actions = np.asarray(running_paths[idx]['actions'])
                    paths.append(
                        dict(observations=obs,
                             actions=actions,
                             rewards=np.asarray(running_paths[idx]['rewards']),
                             env_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]['env_infos']),
                             agent_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]['agent_infos'])))
                    n_samples += len(running_paths[idx]['rewards'])
                    running_paths[idx] = None

            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        tabular.record('PolicyExecTime', policy_time)
        tabular.record('EnvExecTime', env_time)
        tabular.record('ProcessExecTime', process_time)

        return paths if whole_paths else truncate_paths(paths, batch_size)
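A note on the whole_paths flag: reading the return statement above, setting it to False routes the collected paths through truncate_paths, presumably clipping the total sample count to batch_size (an assumption about truncate_paths, not a verified contract). An illustrative call, with sampler standing for an instance of this class:

# Illustrative: request an exact sample budget for fixed-size batches.
paths = sampler.obtain_samples(itr=0, batch_size=4000, whole_paths=False)
total_samples = sum(len(p['rewards']) for p in paths)  # expected: 4000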
Example #5
    def obtain_samples(self, itr, num_samples, agent_update, env_update=None):
        """Sample the policy for new trajectories.

        Args:
            itr(int): Iteration number.
            num_samples(int): Number of steps the sampler should collect.
            agent_update(object): Value which will be passed into the
                `agent_update_fn` before doing rollouts. If a list is passed
                in, it must have length exactly `factory.n_workers`, and will
                be spread across the workers.
            env_update(object): Value which will be passed into the
                `env_update_fn` before doing rollouts. If a list is passed in,
                it must have length exactly `factory.n_workers`, and will be
                spread across the workers.

        Returns:
            list[dict]: Sample paths, each path with key
                * observations: (numpy.ndarray)
                * actions: (numpy.ndarray)
                * rewards: (numpy.ndarray)
                * agent_infos: (dict)
                * env_infos: (dict)

        """
        active_workers = []
        active_worker_ids = []
        idle_worker_ids = list(range(self._num_workers))
        pbar = ProgBarCounter(num_samples)
        completed_samples = 0
        traj = []
        updating_workers = []

        # update the policy params of each worker before sampling
        # for the current iteration
        param_ids = self._worker_factory.prepare_worker_messages(
            agent_update, ray.put)
        env_ids = self._worker_factory.prepare_worker_messages(
            env_update, ray.put)
        while idle_worker_ids:
            worker_id = idle_worker_ids.pop()
            worker = self._all_workers[worker_id]
            updating_workers.append(
                worker.update.remote(param_ids[worker_id], env_ids[worker_id]))

        while completed_samples < num_samples:
            # if there are workers still being updated, check
            # which ones are still updating and take the workers that
            # are done updating, and start collecting trajectories on
            # those workers.
            if updating_workers:
                updated, updating_workers = ray.wait(updating_workers,
                                                     num_returns=1,
                                                     timeout=0.1)
                upd = [ray.get(up) for up in updated]
                idle_worker_ids.extend(upd)

            # if there are idle workers, use them to collect trajectories
            # mark the newly busy workers as active
            while idle_worker_ids:
                idle_worker_id = idle_worker_ids.pop()
                active_worker_ids.append(idle_worker_id)
                worker = self._all_workers[idle_worker_id]
                active_workers.append(worker.rollout.remote())

            # check which workers are done/not done collecting a sample
            # if any are done, send them to process the collected trajectory
            # if they are not, keep checking if they are done
            ready, not_ready = ray.wait(active_workers,
                                        num_returns=1,
                                        timeout=0.001)
            active_workers = not_ready
            for result in ready:
                trajectory, num_returned_samples = _process_trajectory(
                    result, active_worker_ids, idle_worker_ids)
                completed_samples += num_returned_samples
                pbar.inc(num_returned_samples)
                traj.append(trajectory)
        pbar.stop()
        return traj
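prepare_worker_messages is used here but not defined in the snippet. Going only by the docstring above (a single value is shared by all workers, while a list must have exactly n_workers entries), a standalone sketch of that behavior could look like this; it is an assumption about the semantics, not the worker factory's actual code:

def prepare_worker_messages(update, preprocess=lambda x: x, n_workers=1):
    # Broadcast one update to every worker, or validate a per-worker list,
    # applying a preprocessing step such as ray.put or cloudpickle.dumps.
    if isinstance(update, list):
        if len(update) != n_workers:
            raise ValueError('need exactly one update per worker')
        return [preprocess(u) for u in update]
    return [preprocess(update)] * n_workers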
Example #6
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Sample the policy for new trajectories.

        Args:
            itr (int): Iteration number.
            batch_size (int): Number of samples to be collected. If None,
                it defaults to algo.max_path_length * n_envs.
            whole_paths (bool): Whether to return all the paths as-is. True
                by default. The total number of collected samples can exceed
                batch_size; if this flag is False, the paths are truncated
                so that the total equals batch_size.

        Returns:
            list[dict]: Sample paths.

        Note:
            Each path is a dictionary, with keys and values as following:
                * observations: numpy.ndarray with shape [Batch, *obs_dims]
                * actions: numpy.ndarray with shape [Batch, *act_dims]
                * rewards: numpy.ndarray with shape [Batch, ]
                * env_infos: A dictionary with each key representing one
                  environment info, value being a numpy.ndarray with shape
                  [Batch, ?]. One example is "ale.lives" for atari
                  environments.
                * agent_infos: A dictionary with each key representing one
                  agent info, value being a numpy.ndarray with shape
                  [Batch, ?]. One example is "prev_action", which is used
                  for recurrent policy as previous action input, merged with
                  the observation input as the state input.
                * dones: numpy.ndarray with shape [Batch, ]

        """
        logger.log('Obtaining samples for iteration %d...' % itr)

        if not batch_size:
            batch_size = self.algo.max_path_length * self._n_envs

        paths = []
        n_samples = 0
        obses = self._vec_env.reset()
        dones = np.asarray([True] * self._vec_env.num_envs)
        running_paths = [None] * self._vec_env.num_envs

        pbar = ProgBarCounter(batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy

        while n_samples < batch_size:
            t = time.time()
            policy.reset(dones)

            actions, agent_infos = policy.get_actions(obses)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = \
                self._vec_env.step(actions)
            env_time += time.time() - t
            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self._vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self._vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(  # noqa: E501
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(observations=[],
                                              actions=[],
                                              rewards=[],
                                              env_infos=[],
                                              agent_infos=[],
                                              dones=[])
                running_paths[idx]['observations'].append(observation)
                running_paths[idx]['actions'].append(action)
                running_paths[idx]['rewards'].append(reward)
                running_paths[idx]['env_infos'].append(env_info)
                running_paths[idx]['agent_infos'].append(agent_info)
                running_paths[idx]['dones'].append(done)
                if done:
                    obs = np.asarray(running_paths[idx]['observations'])
                    actions = np.asarray(running_paths[idx]['actions'])
                    paths.append(
                        dict(observations=obs,
                             actions=actions,
                             rewards=np.asarray(running_paths[idx]['rewards']),
                             env_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]['env_infos']),
                             agent_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]['agent_infos']),
                             dones=np.asarray(running_paths[idx]['dones'])))
                    n_samples += len(running_paths[idx]['rewards'])
                    running_paths[idx] = None

            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        tabular.record('PolicyExecTime', policy_time)
        tabular.record('EnvExecTime', env_time)
        tabular.record('ProcessExecTime', process_time)

        return paths if whole_paths else truncate_paths(paths, batch_size)
Example #7
    def run_collect(self,
                    collect_once,
                    threshold,
                    args=None,
                    show_prog_bar=True):
        """
        Run the collector method using the worker pool. The collect_once method
        will receive 'g' as its first argument, followed by the provided args,
        if any. The method should return a pair of values. The first should be
        the object to be collected, and the second is the increment to be
        added.
        This will continue until the total increment reaches or exceeds the
        given threshold.

        Sample script:

        def collect_once(g):
            return 'a', 1

        stateful_pool.run_collect(collect_once, threshold=3)
        # should return ['a', 'a', 'a']

        :param collector:
        :param threshold:
        :return:
        """
        assert not inspect.ismethod(collect_once), (
            'run_collect() cannot run a class method. Please ensure that '
            "collect_once is a function with the prototype 'def foo(g, ...)', "
            'where g is an object of type '
            'garage.sampler.stateful_pool.SharedGlobal')

        if args is None:
            args = tuple()
        if self.pool:
            counter = self.manager.Value('i', 0)
            lock = self.manager.RLock()
            results = self.pool.map_async(_worker_run_collect, [
                (collect_once, counter, lock, threshold, args)
            ] * self.n_parallel)
            if show_prog_bar:
                pbar = ProgBarCounter(threshold)
            last_value = 0
            while True:
                time.sleep(0.1)
                with lock:
                    if counter.value >= threshold:
                        if show_prog_bar:
                            pbar.stop()
                        break
                    if show_prog_bar:
                        pbar.inc(counter.value - last_value)
                    last_value = counter.value
            return sum(results.get(), [])
        else:
            count = 0
            results = []
            if show_prog_bar:
                pbar = ProgBarCounter(threshold)
            while count < threshold:
                result, inc = collect_once(self.G, *args)
                results.append(result)
                count += inc
                if show_prog_bar:
                    pbar.inc(inc)
            if show_prog_bar:
                pbar.stop()
            return results
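The docstring's sample script can be made fully self-contained. The sketch below reproduces the serial fallback branch above with a stand-in for the shared-global object (SharedGlobal here is illustrative, not the library's class):

class SharedGlobal:
    # Stand-in for the shared state object passed to collect_once as 'g'.
    pass

def collect_once(g):
    # Returns (collected object, increment), as run_collect() expects.
    return 'a', 1

def run_collect_serial(collect_once, threshold, args=()):
    g = SharedGlobal()
    count, results = 0, []
    while count < threshold:
        result, inc = collect_once(g, *args)
        results.append(result)
        count += inc
    return results

print(run_collect_serial(collect_once, threshold=3))  # ['a', 'a', 'a']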
Example #8
    def obtain_exact_trajectories(self,
                                  n_traj_per_worker,
                                  agent_update,
                                  env_update=None):
        """Sample an exact number of trajectories per worker.

        Args:
            n_traj_per_worker (int): Exact number of trajectories to gather for
                each worker.
            agent_update(object): Value which will be passed into the
                `agent_update_fn` before doing rollouts. If a list is passed
                in, it must have length exactly `factory.n_workers`, and will
                be spread across the workers.
            env_update(object): Value which will be passed into the
                `env_update_fn` before doing rollouts. If a list is passed in,
                it must have length exactly `factory.n_workers`, and will be
                spread across the workers.

        Returns:
            TrajectoryBatch: Batch of gathered trajectories. Always in worker
                order. In other words, first all trajectories from worker 0,
                then all trajectories from worker 1, etc.

        """
        active_workers = []
        pbar = ProgBarCounter(self._worker_factory.n_workers)
        trajectories = defaultdict(list)

        # update the policy params of each worker before sampling
        # for the current iteration
        idle_worker_ids = []
        updating_workers = self._update_workers(agent_update, env_update)

        while any(
                len(trajectories[i]) < n_traj_per_worker
                for i in range(self._worker_factory.n_workers)):
            # if there are workers still being updated, check
            # which ones are still updating and take the workers that
            # are done updating, and start collecting trajectories on
            # those workers.
            if updating_workers:
                updated, updating_workers = ray.wait(updating_workers,
                                                     num_returns=1,
                                                     timeout=0.1)
                upd = [ray.get(up) for up in updated]
                idle_worker_ids.extend(upd)

            # if there are idle workers, use them to collect trajectories
            # mark the newly busy workers as active
            while idle_worker_ids:
                idle_worker_id = idle_worker_ids.pop()
                worker = self._all_workers[idle_worker_id]
                active_workers.append(worker.rollout.remote())

            # check which workers are done/not done collecting a sample
            # if any are done, send them to process the collected trajectory
            # if they are not, keep checking if they are done
            ready, not_ready = ray.wait(active_workers,
                                        num_returns=1,
                                        timeout=0.001)
            active_workers = not_ready
            for result in ready:
                ready_worker_id, trajectory_batch = ray.get(result)
                pbar.inc(1)
                trajectories[ready_worker_id].append(trajectory_batch)
                if len(trajectories[ready_worker_id]) < n_traj_per_worker:
                    idle_worker_ids.append(ready_worker_id)
        pbar.stop()
        ordered_trajectories = list(
            itertools.chain(*[
                trajectories[i] for i in range(self._worker_factory.n_workers)
            ]))
        return TrajectoryBatch.concatenate(*ordered_trajectories)
Example #9
    def obtain_samples(self, itr, num_samples, agent_update, env_update=None):
        """Sample the policy for new trajectories.

        Args:
            itr(int): Iteration number.
            num_samples(int): Number of steps the sampler should collect.
            agent_update(object): Value which will be passed into the
                `agent_update_fn` before doing rollouts. If a list is passed
                in, it must have length exactly `factory.n_workers`, and will
                be spread across the workers.
            env_update(object): Value which will be passed into the
                `env_update_fn` before doing rollouts. If a list is passed in,
                it must have length exactly `factory.n_workers`, and will be
                spread across the workers.

        Returns:
            TrajectoryBatch: Batch of gathered trajectories.

        """
        active_workers = []
        pbar = ProgBarCounter(num_samples)
        completed_samples = 0
        batches = []

        # update the policy params of each worker before sampling
        # for the current iteration
        idle_worker_ids = []
        updating_workers = self._update_workers(agent_update, env_update)

        while completed_samples < num_samples:
            # if there are workers still being updated, check
            # which ones are still updating and take the workers that
            # are done updating, and start collecting trajectories on
            # those workers.
            if updating_workers:
                updated, updating_workers = ray.wait(updating_workers,
                                                     num_returns=1,
                                                     timeout=0.1)
                upd = [ray.get(up) for up in updated]
                idle_worker_ids.extend(upd)

            # if there are idle workers, use them to collect trajectories
            # mark the newly busy workers as active
            while idle_worker_ids:
                idle_worker_id = idle_worker_ids.pop()
                worker = self._all_workers[idle_worker_id]
                active_workers.append(worker.rollout.remote())

            # check which workers are done/not done collecting a sample
            # if any are done, send them to process the collected trajectory
            # if they are not, keep checking if they are done
            ready, not_ready = ray.wait(active_workers,
                                        num_returns=1,
                                        timeout=0.001)
            active_workers = not_ready
            for result in ready:
                ready_worker_id, trajectory_batch = ray.get(result)
                idle_worker_ids.append(ready_worker_id)
                num_returned_samples = trajectory_batch.lengths.sum()
                completed_samples += num_returned_samples
                pbar.inc(num_returned_samples)
                batches.append(trajectory_batch)
        pbar.stop()
        return TrajectoryBatch.concatenate(*batches)
Example #10
    def obtain_exact_trajectories(self,
                                  n_traj_per_worker,
                                  agent_update,
                                  env_update=None):
        """Sample an exact number of trajectories per worker.

        Args:
            n_traj_per_worker (int): Exact number of trajectories to gather for
                each worker.
            agent_update(object): Value which will be passed into the
                `agent_update_fn` before doing rollouts. If a list is passed
                in, it must have length exactly `factory.n_workers`, and will
                be spread across the workers.
            env_update(object): Value which will be passed into the
                `env_update_fn` before doing rollouts. If a list is passed in,
                it must have length exactly `factory.n_workers`, and will be
                spread across the workers.

        Returns:
            TrajectoryBatch: Batch of gathered trajectories. Always in worker
                order. In other words, first all trajectories from worker 0,
                then all trajectories from worker 1, etc.

        Raises:
            AssertionError: On internal errors.

        """
        pbar = ProgBarCounter(self._factory.n_workers)
        self._agent_version += 1
        updated_workers = set()
        agent_ups = self._factory.prepare_worker_messages(
            agent_update, cloudpickle.dumps)
        env_ups = self._factory.prepare_worker_messages(env_update)
        trajectories = defaultdict(list)

        while any(
                len(trajectories[i]) < n_traj_per_worker
                for i in range(self._factory.n_workers)):
            self._push_updates(updated_workers, agent_ups, env_ups)
            tag, contents = self._to_sampler.get()
            if tag == 'trajectory':
                batch, version, worker_n = contents
                if version == self._agent_version:
                    if len(trajectories[worker_n]) < n_traj_per_worker:
                        trajectories[worker_n].append(batch)
                    if len(trajectories[worker_n]) == n_traj_per_worker:
                        pbar.inc(1)
                        try:
                            self._to_worker[worker_n].put_nowait(('stop', ()))
                        except queue.Full:
                            pass
            else:
                raise AssertionError('Unknown tag {} with contents {}'.format(
                    tag, contents))

        for q in self._to_worker:
            try:
                q.put_nowait(('stop', ()))
            except queue.Full:
                pass
        pbar.stop()
        ordered_trajectories = list(
            itertools.chain(
                *[trajectories[i] for i in range(self._factory.n_workers)]))
        return TrajectoryBatch.concatenate(*ordered_trajectories)
Example #11
    def obtain_samples(self, itr, num_samples, agent_update, env_update=None):
        """Collect at least a given number transitions (timesteps).

        Args:
            itr(int): The current iteration number. Using this argument is
                deprecated.
            num_samples(int): Minimum number of transitions / timesteps to
                sample.
            agent_update(object): Value which will be passed into the
                `agent_update_fn` before doing rollouts. If a list is passed
                in, it must have length exactly `factory.n_workers`, and will
                be spread across the workers.
            env_update(object): Value which will be passed into the
                `env_update_fn` before doing rollouts. If a list is passed in,
                it must have length exactly `factory.n_workers`, and will be
                spread across the workers.

        Returns:
            garage.TrajectoryBatch: The batch of collected trajectories.

        Raises:
            AssertionError: On internal errors.

        """
        del itr
        pbar = ProgBarCounter(num_samples)
        batches = []
        completed_samples = 0
        self._agent_version += 1
        updated_workers = set()
        agent_ups = self._factory.prepare_worker_messages(
            agent_update, cloudpickle.dumps)
        env_ups = self._factory.prepare_worker_messages(env_update)

        while completed_samples < num_samples:
            self._push_updates(updated_workers, agent_ups, env_ups)
            for _ in range(self._factory.n_workers):
                try:
                    tag, contents = self._to_sampler.get_nowait()
                    if tag == 'trajectory':
                        batch, version, worker_n = contents
                        del worker_n
                        if version == self._agent_version:
                            batches.append(batch)
                            num_returned_samples = batch.lengths.sum()
                            completed_samples += num_returned_samples
                            pbar.inc(num_returned_samples)
                        else:
                            # Receiving paths from previous iterations is
                            # normal.  Potentially, we could gather them here,
                            # if an off-policy method wants them.
                            pass
                    else:
                        raise AssertionError(
                            'Unknown tag {} with contents {}'.format(
                                tag, contents))
                except queue.Empty:
                    pass
        for q in self._to_worker:
            try:
                q.put_nowait(('stop', ()))
            except queue.Full:
                pass
        pbar.stop()
        return TrajectoryBatch.concatenate(*batches)
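The version check above is what makes this protocol tolerant of stale rollouts: batches tagged with an older agent version are simply dropped. A self-contained illustration of that filtering rule, using placeholder batches and the ('trajectory', (batch, version, worker_n)) message shape from the code:

# Illustrative: keep only batches produced with the current agent version.
agent_version = 2
messages = [
    ('trajectory', ('batch_a', 1, 0)),  # stale: sampled with old params
    ('trajectory', ('batch_b', 2, 1)),  # sampled with current params
]
kept = [batch for tag, (batch, version, _worker) in messages
        if tag == 'trajectory' and version == agent_version]
print(kept)  # ['batch_b']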
Example #12
def standard_eval(env,
                  policy,
                  n_episodes=20,
                  greedy=True,
                  load_from_file=False,
                  render=False,
                  recorder=None,
                  max_steps=10000):
    if recorder is not None:
        render = False  # force off
    if load_from_file:
        logger.add_output(dowel.StdOutput())
    logger.log('Evaluating policy, {} episodes, greedy = {} ...'.format(
        n_episodes, greedy))
    episode_rewards = []
    pbar = ProgBarCounter(n_episodes)
    for e in range(n_episodes):
        obs = env.reset()
        policy.reset([True])
        terminated = False
        t = 0
        episode_rewards.append(0)
        while not terminated:
            if render:
                env.render()
                # time.sleep(0.05)
            if recorder is not None:
                recorder.capture_frame()
            if not env.centralized:
                # obs.shape = (n_agents, n_envs, obs_dim)
                obs = torch.Tensor(obs).unsqueeze(1)  # add n_envs dim
                avail_actions = torch.Tensor(
                    env.get_avail_actions()).unsqueeze(1)
                actions, agent_infos = policy.get_actions(obs,
                                                          avail_actions,
                                                          greedy=greedy)
                if len(actions.shape) == 3:  # n-d action
                    actions = actions[:, 0, :]
                elif len(actions.shape) == 2:  # 1-d action
                    actions = actions[:, 0]
                obs, reward, terminated, info = env.step(actions)  # n_env = 1
                terminated = all(terminated)
            else:
                # obs.shape = (n_envs, n_agents * obs_dim)
                obs = np.array([obs])
                avail_actions = np.array([env.get_avail_actions()])
                actions, agent_infos = policy.get_actions(obs,
                                                          avail_actions,
                                                          greedy=greedy)
                obs, reward, terminated, info = env.step(
                    actions[0])  # n_env = 1
            t += 1
            if t > max_steps:
                terminated = True
            episode_rewards[-1] += np.mean(reward)
        pbar.inc(1)
    pbar.stop()
    policy.reset([True])
    avg_return = np.mean(episode_rewards)
    logger.log('EvalAvgReturn: {}'.format(avg_return))
    if not load_from_file:
        tabular.record('EvalAvgReturn', avg_return)
Example #13
    def obtain_samples(self, itr):
        logger.log("Obtaining samples for iteration %d..." % itr)
        paths = []
        n_samples = 0
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy

        import time
        while n_samples < self.algo.batch_size:
            t = time.time()
            policy.reset(dones)

            actions, agent_infos = policy.get_actions(obses)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t
            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, next_observation, env_info, agent_info, done in zip(  # noqa: E501
                    itertools.count(), obses, actions, rewards, next_obses,
                    env_infos, agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        next_observations=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["next_observations"].append(
                    next_observation)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths.append(
                        dict(observations=self.env_spec.observation_space.
                             flatten_n(running_paths[idx]["observations"]),
                             actions=self.env_spec.action_space.flatten_n(
                                 running_paths[idx]["actions"]),
                             rewards=tensor_utils.stack_tensor_list(
                                 running_paths[idx]["rewards"]),
                             next_observation=tensor_utils.stack_tensor_list(
                                 running_paths[idx]["next_observations"]),
                             env_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]["env_infos"]),
                             agent_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]["agent_infos"])))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None

            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular("PolicyExecTime", policy_time)
        logger.record_tabular("EnvExecTime", env_time)
        logger.record_tabular("ProcessExecTime", process_time)

        return paths
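Because this variant also records next observations, each returned path maps directly onto (s, a, r, s') transitions, which is what an off-policy replay buffer consumes. A self-contained sketch with stand-in arrays (note the flattened key is 'next_observation', singular, as written above):

import numpy as np

# Illustrative stand-in for one returned path (shapes are arbitrary).
path = {
    'observations': np.zeros((3, 4)),
    'actions': np.zeros((3, 2)),
    'rewards': np.zeros(3),
    'next_observation': np.zeros((3, 4)),
}
# Flat (s, a, r, s') transition tuples.
transitions = list(zip(path['observations'], path['actions'],
                       path['rewards'], path['next_observation']))
print(len(transitions))  # 3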
Example #14
    def eval(self,
             policy,
             n_episodes=20,
             greedy=True,
             load_from_file=False,
             max_steps=60):
        import dowel
        from dowel import logger, tabular
        from garage.misc.prog_bar_counter import ProgBarCounter

        if load_from_file:
            logger.add_output(dowel.StdOutput())
        logger.log('Evaluating policy, {} episodes, greedy = {} ...'.format(
            n_episodes, greedy))
        episode_rewards = []
        success = 0
        pbar = ProgBarCounter(n_episodes)
        for e in range(n_episodes):
            obs = self.reset()
            policy.reset([True])
            terminated = False
            t = 0
            episode_rewards.append(0)
            while not terminated:
                if not self.centralized:
                    # obs.shape = (n_agents, n_envs, obs_dim)
                    obs = torch.Tensor(obs).unsqueeze(1)  # add n_envs dim
                    avail_actions = torch.Tensor(
                        self.get_avail_actions()).unsqueeze(1)
                    actions, agent_infos = policy.get_actions(obs,
                                                              avail_actions,
                                                              greedy=greedy)
                    if len(actions.shape) == 3:  # n-d action
                        actions = actions[:, 0, :]
                    elif len(actions.shape) == 2:  # 1-d action
                        actions = actions[:, 0]
                    obs, reward, terminated, info = self.step(
                        actions)  # n_env = 1
                    terminated = all(terminated)
                else:
                    # obs.shape = (n_envs, n_agents * obs_dim)
                    obs = np.array([obs])
                    avail_actions = np.array([self.get_avail_actions()])
                    actions, agent_infos = policy.get_actions(obs,
                                                              avail_actions,
                                                              greedy=greedy)
                    obs, reward, terminated, info = self.step(
                        actions[0])  # n_env = 1
                t += 1
                if t >= max_steps:
                    terminated = True
                episode_rewards[-1] += np.mean(reward)
            # episode end
            success += self.stat['success']
            pbar.inc(1)
        pbar.stop()
        policy.reset([True])
        avg_return = np.mean(episode_rewards)
        success = success / n_episodes
        logger.log('EvalAvgReturn: {}'.format(avg_return))
        logger.log('EvalSuccessRate: {}'.format(success))
        if not load_from_file:
            tabular.record('EvalAvgReturn', avg_return)
            tabular.record('EvalSuccessRate', success)
Example #15
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Sample the policy for new trajectories.

        If batch_size is not specified, each task defaults to one episode,
        so the batch size will be meta_batch_size * max_path_length.

        When the number of workers is less than the meta batch size,
        sampling is performed for each entry of self._vec_envs_indices in
        series. The i-th value of self._vec_envs_indices contains the
        indices of the environments/tasks to be sampled in the i-th
        iteration.

        Args:
            itr (int): Iteration number.
            batch_size (int): Number of samples to be collected. If None,
                it defaults to algo.max_path_length * meta_batch_size.
            whole_paths (bool): Whether to return all the paths as-is. True
                by default. The total number of collected samples can exceed
                batch_size; if this flag is False, the paths are truncated
                so that the total equals batch_size.

        Returns:
            list[dict]: Sample paths. Each path includes a batch_idx key
                identifying the environment/task it was sampled from.


        Note:
            Each path is a dictionary, with keys and values as following:
                * observations: numpy.ndarray with shape :math:`[N, S^*]`
                * actions: numpy.ndarray with shape :math:`[N, S^*]`
                * rewards: numpy.ndarray with shape :math:`[N, S^*]`
                * dones: numpy.ndarray with shape :math:`[N, S^*]`
                * env_infos: A dictionary with each key representing one
                  environment info, value being a numpy.ndarray with shape
                  :math:`[N, S^*]`. One example is "ale.lives" for atari
                  environments.
                * agent_infos: A dictionary with each key representing one
                  agent info, value being a numpy.ndarray with shape
                  :math:`[N, S^*]`. One example is "prev_action", which is used
                  for recurrent policy as previous action input, merged with
                  the observation input as the state input.

        """
        logger.log('Obtaining samples for iteration %d...' % itr)

        if batch_size is None:
            batch_size = self.algo.max_path_length * self._meta_batch_size

        paths = []

        tasks = self.env.sample_tasks(self._meta_batch_size)

        # Start main loop
        batch_size_per_loop = batch_size // len(self._vec_envs_indices)
        for vec_envs_indices in self._vec_envs_indices:
            self._setup_worker(vec_envs_indices, tasks)

            n_samples = 0
            obses = self._vec_env.reset()
            dones = np.asarray([True] * self._vec_env.num_envs)
            running_paths = [None] * self._vec_env.num_envs

            pbar = ProgBarCounter(batch_size)
            policy_time = 0
            env_time = 0
            process_time = 0

            policy = self.algo.policy
            # Only reset policies at the beginning of a meta batch
            policy.reset(dones)

            while n_samples < batch_size_per_loop:
                t = time.time()

                actions, agent_infos = policy.get_actions(obses)

                policy_time += time.time() - t
                t = time.time()
                next_obses, rewards, dones, env_infos = self._vec_env.step(
                    actions)
                env_time += time.time() - t
                t = time.time()

                agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
                env_infos = tensor_utils.split_tensor_dict_list(env_infos)
                if env_infos is None:
                    env_infos = [dict() for _ in range(self._vec_env.num_envs)]
                if agent_infos is None:
                    agent_infos = [
                        dict() for _ in range(self._vec_env.num_envs)
                    ]
                for idx, observation, action, reward, env_info, agent_info, done in zip(  # noqa: E501
                        itertools.count(), obses, actions, rewards, env_infos,
                        agent_infos, dones):
                    if running_paths[idx] is None:
                        running_paths[idx] = dict(
                            observations=[],
                            actions=[],
                            rewards=[],
                            dones=[],
                            env_infos=[],
                            agent_infos=[],
                        )
                    running_paths[idx]['observations'].append(observation)
                    running_paths[idx]['actions'].append(action)
                    running_paths[idx]['rewards'].append(reward)
                    running_paths[idx]['dones'].append(done)
                    running_paths[idx]['env_infos'].append(env_info)
                    running_paths[idx]['agent_infos'].append(agent_info)
                    if done:
                        obs = np.asarray(running_paths[idx]['observations'])
                        actions = np.asarray(running_paths[idx]['actions'])
                        paths.append(
                            dict(observations=obs,
                                 actions=actions,
                                 rewards=np.asarray(
                                     running_paths[idx]['rewards']),
                                 dones=np.asarray(running_paths[idx]['dones']),
                                 env_infos=tensor_utils.stack_tensor_dict_list(
                                     running_paths[idx]['env_infos']),
                                 agent_infos=tensor_utils.
                                 stack_tensor_dict_list(
                                     running_paths[idx]['agent_infos']),
                                 batch_idx=idx))
                        n_samples += len(running_paths[idx]['rewards'])
                        running_paths[idx] = None

                process_time += time.time() - t
                pbar.inc(len(obses))
                obses = next_obses

        pbar.stop()

        tabular.record('PolicyExecTime', policy_time)
        tabular.record('EnvExecTime', env_time)
        tabular.record('ProcessExecTime', process_time)

        return paths if whole_paths else truncate_paths(paths, batch_size)
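Since each path produced above carries a batch_idx key, callers can recover a per-task (meta-batch) view by grouping the flat list themselves. A self-contained sketch with stand-in paths:

from collections import OrderedDict

# Illustrative: group paths by the batch_idx recorded at sampling time.
paths = [{'batch_idx': 0, 'rewards': [1.0]},
         {'batch_idx': 1, 'rewards': [0.5]},
         {'batch_idx': 0, 'rewards': [2.0]}]
paths_by_task = OrderedDict()
for path in paths:
    paths_by_task.setdefault(path['batch_idx'], []).append(path)
print({k: len(v) for k, v in paths_by_task.items()})  # {0: 2, 1: 1}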