Example #1
    def init_plot(self, env, policy):
        if not Plotter.enable:
            return
        if not (self._process and self._queue):
            self.init_worker()

        # Needed in order to draw glfw window on the main thread
        if 'Darwin' in platform.platform():
            rollout(
                env, policy, max_path_length=np.inf, animated=True, speedup=5)

        self._queue.put(Message(op=Op.UPDATE, args=(env, policy), kwargs=None))
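All of these snippets call garage's rollout helper. For orientation, here is a minimal sketch of what such a helper does; it assumes a Gym-style env.step returning (obs, reward, done, info) and a policy exposing get_action, and it is not garage's exact implementation.

import time
import numpy as np

def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1,
            deterministic=False):
    # Run one episode and collect the per-step data (sketch, see note above).
    observations, actions, rewards, agent_infos, env_infos = [], [], [], [], []
    o = env.reset()
    path_length = 0
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        if deterministic and 'mean' in agent_info:
            a = agent_info['mean']  # use the mean action instead of sampling
        next_o, r, done, env_info = env.step(a)
        observations.append(o)
        actions.append(a)
        rewards.append(r)
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if animated:
            env.render()
            time.sleep(0.05 / speedup)
        if done:
            break
        o = next_o
    return dict(observations=np.asarray(observations),
                actions=np.asarray(actions),
                rewards=np.asarray(rewards),
                agent_infos=agent_infos,
                env_infos=env_infos)

The real helper additionally stacks agent_infos and env_infos into dicts of arrays, which is why the tests further down index path['agent_infos']['dummy'].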
Example #2
def simulate_policy(args):
    with tf.Session():
        data = joblib.load(args.file)
        if 'algo' in data.keys():
            policy = data['algo'].policy
            env = data['algo'].env
        else:
            policy = data['policy']
            env = data['env']

        while True:
            rollout(env, policy,
                    max_path_length=args.max_path_length,
                    animated=True, speedup=args.speedup)
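simulate_policy expects an argparse namespace carrying file, max_path_length and speedup; a hypothetical parser producing those fields (not part of this snippet, but analogous to the one in Example #20) could be:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('file', type=str, help='path to the snapshot file')
parser.add_argument('--max_path_length', type=int, default=1000)
parser.add_argument('--speedup', type=float, default=1)
simulate_policy(parser.parse_args())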
Example #3
    def _worker_start(self):
        env = None
        policy = None
        max_length = None
        initial_rollout = True
        try:
            # Each iteration will process ALL messages currently in the
            # queue
            while True:
                msgs = {}
                # If true, block and yield processor
                if initial_rollout:
                    msg = self._queue.get()
                    msgs[msg.op] = msg
                    # Only fetch the last message of each type
                    while not self._queue.empty():
                        msg = self._queue.get()
                        msgs[msg.op] = msg
                else:
                    # Only fetch the last message of each type
                    while not self._queue.empty():
                        msg = self._queue.get_nowait()
                        msgs[msg.op] = msg

                if Op.STOP in msgs:
                    break
                elif Op.UPDATE in msgs:
                    env, policy = msgs[Op.UPDATE].args
                elif Op.DEMO in msgs:
                    param_values, max_length = msgs[Op.DEMO].args
                    policy.set_param_values(param_values)
                    initial_rollout = False
                    rollout(
                        env,
                        policy,
                        max_path_length=max_length,
                        animated=True,
                        speedup=5)
                else:
                    if max_length:
                        rollout(
                            env,
                            policy,
                            max_path_length=max_length,
                            animated=True,
                            speedup=5)
        except KeyboardInterrupt:
            pass
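The worker loop above dispatches on Message objects tagged with an Op. A sketch of how these could be defined, assuming the layout implied by how the snippets construct Message(op=Op.UPDATE, args=..., kwargs=None):

from collections import namedtuple
from enum import Enum

Op = Enum('Op', 'STOP UPDATE DEMO')
Message = namedtuple('Message', ['op', 'args', 'kwargs'])

# Example producer side: ask the worker to replay the latest parameters.
# queue.put(Message(op=Op.DEMO, args=(policy.get_param_values(), 500),
#                   kwargs=None))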
Example #4
def obtain_evaluation_samples(policy, env, max_path_length=1000,
                              num_trajs=100):
    """Sample the policy for num_trajs trajectories and return average values.

    Args:
        policy (garage.Policy): Policy to use as the actor when
            gathering samples.
        env (garage.envs.GarageEnv): The environment used to obtain
            trajectories.
        max_path_length (int): Maximum path length. The episode will
            terminate when the trajectory length reaches max_path_length.
        num_trajs (int): Number of trajectories.

    Returns:
        TrajectoryBatch: Evaluation trajectories, representing the best
            current performance of the algorithm.

    """
    paths = []
    # Use a finite length rollout for evaluation.

    for _ in range(num_trajs):
        path = rollout(env,
                       policy,
                       max_path_length=max_path_length,
                       deterministic=True)
        paths.append(path)
    return TrajectoryBatch.from_trajectory_list(env.spec, paths)
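A hypothetical way to reduce the returned TrajectoryBatch to a scalar score, assuming the batch exposes the concatenated rewards and per-trajectory lengths (the attribute names are an assumption):

import numpy as np

batch = obtain_evaluation_samples(policy, env, num_trajs=20)
# Mean undiscounted return: total reward divided by the number of trajectories.
mean_return = np.sum(batch.rewards) / len(batch.lengths)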
Example #5
def simulate_policy(args):
    with tf.Session():
        data = joblib.load(args.file)
        if 'algo' in data.keys():
            policy = data['algo'].policy
            env = data['algo'].env
        else:
            policy = data['policy']
            env = data['env']

        while True:
            rollout(env,
                    policy,
                    max_path_length=args.max_path_length,
                    animated=True,
                    speedup=args.speedup)
Example #6
def obtain_evaluation_episodes(policy,
                               env,
                               max_episode_length=1000,
                               num_eps=100):
    """Sample the policy for num_eps episodes and return average values.

    Args:
        policy (Policy): Policy to use as the actor when gathering samples.
        env (Environment): The environment used to obtain episodes.
        max_episode_length (int): Maximum episode length. The episode will
            be truncated when its length reaches max_episode_length.
        num_eps (int): Number of episodes.

    Returns:
        EpisodeBatch: Evaluation episodes, representing the best current
            performance of the algorithm.

    """
    episodes = []
    # Use a finite length rollout for evaluation.

    for _ in range(num_eps):
        eps = rollout(env,
                      policy,
                      max_episode_length=max_episode_length,
                      deterministic=True)
        episodes.append(eps)
    return EpisodeBatch.from_list(env.spec, episodes)
Example #7
def _worker_rollout_policy(g, args):
    sample_std = args["sample_std"].flatten()
    cur_mean = args["cur_mean"].flatten()
    n_evals = args["n_evals"]
    k = len(cur_mean)
    params = np.random.standard_normal(k) * sample_std + cur_mean
    g.policy.set_param_values(params)
    paths, returns, undiscounted_returns = [], [], []
    for _ in range(n_evals):
        path = rollout(g.env, g.policy, args["max_path_length"])
        path["returns"] = discount_cumsum(path["rewards"], args["discount"])
        path["undiscounted_return"] = sum(path["rewards"])
        paths.append(path)
        returns.append(path["returns"])
        undiscounted_returns.append(path["undiscounted_return"])

    result_path = {'full_paths': paths}
    result_path['undiscounted_return'] = _get_stderr_lb(undiscounted_returns)
    result_path['returns'] = _get_stderr_lb_varyinglens(returns)

    # n_evals does not count toward the cases below, since it is the number
    # of evaluations of a single parameter set
    if args["criterion"] == "samples":
        inc = len(path["rewards"])
    elif args["criterion"] == "paths":
        inc = 1
    else:
        raise NotImplementedError
    return (params, result_path), inc
Example #8
    def _obtain_evaluation_samples(self,
                                   env,
                                   num_trajs=100,
                                   max_path_length=1000):
        """Sample the policy for 10 trajectories and return average values.

        Args:
            env (garage.envs.GarageEnv): The environment used to obtain
                trajectories.
            num_trajs (int): Number of trajectories.
            max_path_length (int): Number of maximum steps in one batch.

        Returns:
            TrajectoryBatch: Evaluation trajectories, representing the best
                current performance of the algorithm.

        """
        paths = []

        for _ in range(num_trajs):
            path = rollout(env,
                           self.policy,
                           max_path_length=max_path_length,
                           deterministic=True)
            paths.append(path)
        return TrajectoryBatch.from_trajectory_list(self.env_spec, paths)
Example #9
 def test_max_episode_length(self):
     # pylint: disable=unsubscriptable-object
     path = utils.rollout(self.env, self.policy, max_episode_length=3)
     assert path['observations'].shape[0] == 3
     assert path['actions'].shape[0] == 3
     assert path['rewards'].shape[0] == 3
     assert path['agent_infos']['dummy'].shape[0] == 3
     assert path['env_infos']['dummy'].shape[0] == 3
Example #10
File: cma_es.py  Project: psxz/garage
def sample_return(g, params, max_path_length, discount):
    # env, policy, params, max_path_length, discount = args
    # of course we make the strong assumption that there is no race condition
    g.policy.set_param_values(params)
    path = rollout(
        g.env,
        g.policy,
        max_path_length,
    )
    path['returns'] = discount_cumsum(path['rewards'], discount)
    path['undiscounted_return'] = sum(path['rewards'])
    return path
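discount_cumsum is not defined in any of these snippets. A minimal sketch of the standard rllab/garage-style implementation, assuming a 1-D array of rewards, is a reversed linear filter:

import scipy.signal

def discount_cumsum(x, discount):
    # Returns [x0 + d*x1 + d^2*x2 + ..., x1 + d*x2 + ..., ..., x_{T-1}]
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]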
Example #11
 def test_max_path_length(self):
     # pylint: disable=unsubscriptable-object
     path = utils.rollout(self.env, self.policy, max_path_length=3)
     assert path['observations'].shape[0] == 3
     assert path['actions'].shape[0] == 3
     assert path['rewards'].shape[0] == 3
     agent_info = [
         path['agent_infos'][k]
         for k in self.policy.distribution.dist_info_keys
     ]
     assert agent_info[0].shape[0] == 3
     # dummy is the env_info_key
     assert path['env_infos']['dummy'].shape[0] == 3
Example #12
File: plotter.py  Project: songanz/garage
    def init_plot(self, env, policy):
        """Initialize the plotter.

        Args:
            env (GarageEnv): Environment to visualize.
            policy (garage.np.policies.Policy): Policy to roll out in the
                visualization.

        """
        if not Plotter.enable:
            return
        if not (self._process and self._queue):
            self._init_worker()

        # Needed in order to draw glfw window on the main thread
        if 'Darwin' in platform.platform():
            rollout(env,
                    policy,
                    max_episode_length=np.inf,
                    animated=True,
                    speedup=5)

        self._queue.put(Message(op=Op.UPDATE, args=(env, policy), kwargs=None))
Example #13
    def test_snapshot(self):
        with LocalRunner() as runner:
            env = TfEnv(env_name='CartPole-v1')

            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          hidden_sizes=(32, 32))

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = TRPO(env_spec=env.spec,
                        policy=policy,
                        baseline=baseline,
                        max_path_length=100,
                        discount=0.99,
                        max_kl_step=0.01)

            runner.setup(algo, env)
            runner.train(n_epochs=self.verifyItrs, batch_size=4000)

            env.close()

        # Read snapshot from self.log_dir
        # Test the presence and integrity of policy and env
        for i in range(0, self.verifyItrs):
            self.reset_tf()
            with LocalRunner():
                snapshot = joblib.load(
                    osp.join(self.log_dir.name, 'itr_{}.pkl'.format(i)))

                env = snapshot['env']
                algo = snapshot['algo']
                assert env
                assert algo
                assert algo.policy

                rollout(env, algo.policy, animated=False)
Example #14
    def _obtain_evaluation_samples(self,
                                   env,
                                   num_trajs=100,
                                   max_path_length=1000):
        r"""Sample the policy for 10 trajectories and return average values.

        Args:
            env (garage.envs.GarageEnv): The environment used to obtain
                trajectories.
            num_trajs (int): Number of trajectories.
            max_path_length (int): Number of maximum steps in one batch.

        Returns:
            dict: Evaluation trajectories, representing the best current
                performance of the algorithm, with keys:
                * env_spec (garage.envs.EnvSpec): Specification for the
                    environment from which this data was sampled.
                * observations (numpy.ndarray): A numpy array containing the
                    observations for all time steps in this batch.
                * actions (numpy.ndarray): A numpy array containing the
                    actions for all time steps in this batch.
                * rewards (numpy.ndarray): A numpy array containing the
                    rewards for all time steps in this batch.
                * terminals (numpy.ndarray): A boolean numpy array
                    containing the termination signals for all time steps
                    in this batch.
                * env_infos (dict): A dict of numpy arrays containing
                    arbitrary environment state information.
                * agent_infos (dict): A dict of numpy arrays containing
                    arbitrary agent state information.
                * lengths (numpy.ndarray): An integer numpy array
                    containing the length of each trajectory in this batch.
                * discount (float): Discount value.

        """
        paths = []

        for _ in range(num_trajs):
            path = rollout(env,
                           self.policy,
                           max_path_length=max_path_length,
                           deterministic=True)
            paths.append(path)

        obs = [path['observations'] for path in paths]
        actions = [path['actions'] for path in paths]
        rewards = [path['rewards'] for path in paths]
        agent_infos = [path['agent_infos'] for path in paths]
        env_infos = [path['env_infos'] for path in paths]
        terminals = [path['dones'] for path in paths]
        lengths = [len(path['rewards']) for path in paths]

        return dict(env_spec=self.env_spec,
                    observations=obs,
                    actions=actions,
                    rewards=rewards,
                    terminals=terminals,
                    env_infos=env_infos,
                    agent_infos=agent_infos,
                    lengths=lengths,
                    discount=self.discount)
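A hypothetical consumer of the returned dict, e.g. averaging the undiscounted return over the evaluation paths ('algo' here stands for any object exposing the method above):

import numpy as np

eval_stats = algo._obtain_evaluation_samples(env)
# 'rewards' is a list of per-path reward arrays in the dict returned above.
mean_return = np.mean([np.sum(r) for r in eval_stats['rewards']])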
Example #15
 def test_deterministic_action(self):
     path = utils.rollout(self.env,
                          self.policy,
                          max_path_length=5,
                          deterministic=True)
     assert (path['actions'] == 0.).all()
Example #16
 def test_does_flatten(self):
     path = utils.rollout(self.env, self.policy, max_path_length=5)
     assert path['observations'][0].shape == (16, )
     assert path['actions'][0].shape == (2, 2)
Example #17
def _worker_collect_one_path(g, max_path_length, scope=None):
    g = _get_scoped_g(g, scope)
    path = rollout(g.env, g.policy, max_path_length)
    return path, len(path["rewards"])
Example #18
def _worker_collect_one_path_on_traj(g, max_path_length, scope=None):
    g = _get_scoped_g(g, scope)
    path = rollout(g.env, g.policy, max_path_length)
    return path, 1
Example #19
                        help='use the mean action or stochastic action',
                        action='store_true')
    args = parser.parse_args()
    print(args)
    # If the snapshot file uses tensorflow, do:
    # import tensorflow as tf
    # with tf.compat.v1.Session():
    #     [rest of the code]
    with tf.compat.v1.Session() as sess:
        data = joblib.load(args.file)
        policy = data['algo'].policy
        env = data['env']
        while True:
            path = rollout(env,
                           policy,
                           max_path_length=args.max_path_length,
                           animated=True,
                           speedup=args.speedup,
                           deterministic=args.deterministic)

            plt.figure()
            plt.title('observations')
            plt.xlabel('time steps')
            plt.plot(range(args.max_path_length), path['observations'])

            plt.figure()
            plt.title('actions')
            plt.xlabel('time steps')
            plt.plot(range(args.max_path_length), path['actions'])

            plt.figure()
            plt.title('rewards')
Example #20
                             "(or 'y' or 'n').\n")


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str, help='path to the snapshot file')
    parser.add_argument('--max_episode_length',
                        type=int,
                        default=1000,
                        help='Max length of episode')
    parser.add_argument('--speedup', type=float, default=1, help='Speedup')
    args = parser.parse_args()

    # If the snapshot file uses tensorflow, do:
    # import tensorflow as tf
    # with tf.compat.v1.Session():
    #     [rest of the code]
    with tf.compat.v1.Session() as sess:
        with open(args.file, 'rb') as f:
            data = cloudpickle.load(f)
        policy = data['algo'].policy
        env = data['env']
        while True:
            path = rollout(env,
                           policy,
                           max_episode_length=args.max_episode_length,
                           animated=True,
                           speedup=args.speedup)
            if not query_yes_no('Continue simulation?'):
                break
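query_yes_no is called but not defined in this snippet (only the tail of its error message survives above). A minimal sketch of such a prompt helper, not the script's exact code, could be:

def query_yes_no(question):
    # Prompt on stdin until the user gives a yes/no answer.
    valid = {'yes': True, 'y': True, 'no': False, 'n': False}
    while True:
        choice = input(question + ' [y/n] ').strip().lower()
        if choice in valid:
            return valid[choice]
        print("Please respond with 'yes' or 'no' (or 'y' or 'n').")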