def init_plot(self, env, policy):
    if not Plotter.enable:
        return
    if not (self._process and self._queue):
        self.init_worker()

    # Needed in order to draw glfw window on the main thread
    if 'Darwin' in platform.platform():
        rollout(env,
                policy,
                max_path_length=np.inf,
                animated=True,
                speedup=5)

    self._queue.put(Message(op=Op.UPDATE, args=(env, policy), kwargs=None))
def simulate_policy(args):
    with tf.Session():
        data = joblib.load(args.file)
        if 'algo' in data:
            policy = data['algo'].policy
            env = data['algo'].env
        else:
            policy = data['policy']
            env = data['env']
        while True:
            rollout(env,
                    policy,
                    max_path_length=args.max_path_length,
                    animated=True,
                    speedup=args.speedup)
def _worker_start(self):
    env = None
    policy = None
    max_length = None
    initial_rollout = True
    try:
        # Each iteration will process ALL messages currently in the queue
        while True:
            msgs = {}
            # If true, block and yield processor
            if initial_rollout:
                msg = self._queue.get()
                msgs[msg.op] = msg
                # Only fetch the last message of each type
                while not self._queue.empty():
                    msg = self._queue.get()
                    msgs[msg.op] = msg
            else:
                # Only fetch the last message of each type
                while not self._queue.empty():
                    msg = self._queue.get_nowait()
                    msgs[msg.op] = msg

            if Op.STOP in msgs:
                break
            elif Op.UPDATE in msgs:
                env, policy = msgs[Op.UPDATE].args
            elif Op.DEMO in msgs:
                param_values, max_length = msgs[Op.DEMO].args
                policy.set_param_values(param_values)
                initial_rollout = False
                rollout(env,
                        policy,
                        max_path_length=max_length,
                        animated=True,
                        speedup=5)
            else:
                if max_length:
                    rollout(env,
                            policy,
                            max_path_length=max_length,
                            animated=True,
                            speedup=5)
    except KeyboardInterrupt:
        pass
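# A minimal, self-contained sketch (not from the source) of the queue-draining
# pattern _worker_start relies on: drain everything currently queued, keeping
# only the newest message per op, so the worker always acts on the latest
# state. `Message` here is a stand-in namedtuple, not garage's own class.
import queue
from collections import namedtuple

Message = namedtuple('Message', ['op', 'args'])


def drain_latest(q, block_first=False):
    """Return {op: newest message with that op} from everything queued."""
    msgs = {}
    if block_first:
        msg = q.get()  # Block until at least one message arrives.
        msgs[msg.op] = msg
    while True:
        try:
            msg = q.get_nowait()
        except queue.Empty:
            break
        msgs[msg.op] = msg  # Later messages overwrite earlier ones.
    return msgs


q = queue.Queue()
q.put(Message('DEMO', (1, )))
q.put(Message('DEMO', (2, )))
q.put(Message('UPDATE', ('env', 'policy')))
assert drain_latest(q)['DEMO'].args == (2, )  # Only the newest DEMO survives.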
def obtain_evaluation_samples(policy, env, max_path_length=1000,
                              num_trajs=100):
    """Sample the policy for num_trajs trajectories and return average values.

    Args:
        policy (garage.Policy): Policy to use as the actor when gathering
            samples.
        env (garage.envs.GarageEnv): The environment used to obtain
            trajectories.
        max_path_length (int): Maximum path length. The episode will
            terminate when the length of the trajectory reaches
            max_path_length.
        num_trajs (int): Number of trajectories.

    Returns:
        TrajectoryBatch: Evaluation trajectories, representing the best
            current performance of the algorithm.

    """
    paths = []
    # Use a finite length rollout for evaluation.
    for _ in range(num_trajs):
        path = rollout(env,
                       policy,
                       max_path_length=max_path_length,
                       deterministic=True)
        paths.append(path)
    return TrajectoryBatch.from_trajectory_list(env.spec, paths)
def obtain_evaluation_episodes(policy, env, max_episode_length=1000,
                               num_eps=100):
    """Sample the policy for num_eps episodes and return average values.

    Args:
        policy (Policy): Policy to use as the actor when gathering samples.
        env (Environment): The environment used to obtain episodes.
        max_episode_length (int): Maximum episode length. The episode will
            be truncated when its length reaches max_episode_length.
        num_eps (int): Number of episodes.

    Returns:
        EpisodeBatch: Evaluation episodes, representing the best current
            performance of the algorithm.

    """
    episodes = []
    # Use a finite length rollout for evaluation.
    for _ in range(num_eps):
        eps = rollout(env,
                      policy,
                      max_episode_length=max_episode_length,
                      deterministic=True)
        episodes.append(eps)
    return EpisodeBatch.from_list(env.spec, episodes)
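# Hypothetical usage sketch (not from the source): assumes `policy` and `env`
# are already-constructed garage objects, and that EpisodeBatch exposes a flat
# `rewards` array plus per-episode `lengths`, as in garage.
import numpy as np

eps = obtain_evaluation_episodes(policy,
                                 env,
                                 max_episode_length=500,
                                 num_eps=20)
# Split the flat reward array back into per-episode arrays, then average the
# undiscounted returns across episodes.
per_episode = np.split(eps.rewards, np.cumsum(eps.lengths)[:-1])
avg_return = np.mean([r.sum() for r in per_episode])
print('average undiscounted return:', avg_return)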
def _worker_rollout_policy(g, args):
    sample_std = args["sample_std"].flatten()
    cur_mean = args["cur_mean"].flatten()
    n_evals = args["n_evals"]
    k = len(cur_mean)
    # Perturb the current mean parameters with Gaussian noise.
    params = np.random.standard_normal(k) * sample_std + cur_mean
    g.policy.set_param_values(params)
    paths, returns, undiscounted_returns = [], [], []
    for _ in range(n_evals):
        path = rollout(g.env, g.policy, args["max_path_length"])
        path["returns"] = discount_cumsum(path["rewards"], args["discount"])
        path["undiscounted_return"] = sum(path["rewards"])
        paths.append(path)
        returns.append(path["returns"])
        undiscounted_returns.append(path["undiscounted_return"])
    result_path = {'full_paths': paths}
    result_path['undiscounted_return'] = _get_stderr_lb(undiscounted_returns)
    result_path['returns'] = _get_stderr_lb_varyinglens(returns)

    # n_evals does not count toward the cases below, since n_evals is
    # multiple evaluations of a single parameter set.
    if args["criterion"] == "samples":
        inc = len(path["rewards"])
    elif args["criterion"] == "paths":
        inc = 1
    else:
        raise NotImplementedError
    return (params, result_path), inc
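# _get_stderr_lb is not shown in these snippets. Judging only from its name,
# it plausibly returns a standard-error lower bound on the mean
# (mean - std / sqrt(n)); the sketch below is an assumption, not the
# confirmed implementation.
import numpy as np


def _get_stderr_lb(xs):
    # Assumed behavior: a pessimistic (lower-bound) estimate of mean return.
    xs = np.asarray(xs, dtype=np.float64)
    stderr = xs.std(ddof=1) / np.sqrt(len(xs)) if len(xs) > 1 else 0.0
    return xs.mean() - stderr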
def _obtain_evaluation_samples(self, env, num_trajs=100,
                               max_path_length=1000):
    """Sample the policy for num_trajs trajectories and return average values.

    Args:
        env (garage.envs.GarageEnv): The environment used to obtain
            trajectories.
        num_trajs (int): Number of trajectories.
        max_path_length (int): Number of maximum steps in one batch.

    Returns:
        TrajectoryBatch: Evaluation trajectories, representing the best
            current performance of the algorithm.

    """
    paths = []
    for _ in range(num_trajs):
        path = rollout(env,
                       self.policy,
                       max_path_length=max_path_length,
                       deterministic=True)
        paths.append(path)
    return TrajectoryBatch.from_trajectory_list(self.env_spec, paths)
def test_max_episode_length(self):
    # pylint: disable=unsubscriptable-object
    path = utils.rollout(self.env, self.policy, max_episode_length=3)
    assert path['observations'].shape[0] == 3
    assert path['actions'].shape[0] == 3
    assert path['rewards'].shape[0] == 3
    assert path['agent_infos']['dummy'].shape[0] == 3
    assert path['env_infos']['dummy'].shape[0] == 3
def sample_return(g, params, max_path_length, discount):
    # env, policy, params, max_path_length, discount = args
    # Of course we make the strong assumption that there is no race condition.
    g.policy.set_param_values(params)
    path = rollout(
        g.env,
        g.policy,
        max_path_length,
    )
    path['returns'] = discount_cumsum(path['rewards'], discount)
    path['undiscounted_return'] = sum(path['rewards'])
    return path
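# discount_cumsum is not defined in these snippets. The standard
# rllab/garage-style implementation computes
#     returns[t] = sum_{t' >= t} discount**(t' - t) * rewards[t']
# with a reversed linear filter; a sketch assuming only numpy and scipy:
import numpy as np
import scipy.signal


def discount_cumsum(x, discount):
    # Running y[n] = x[n] + discount * y[n-1] over the reversed signal yields
    # all discounted suffix sums in a single pass.
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1],
                                axis=0)[::-1]


print(discount_cumsum(np.array([1., 1., 1.]), 0.9))  # [2.71 1.9  1.  ]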
def test_max_path_length(self):
    # pylint: disable=unsubscriptable-object
    path = utils.rollout(self.env, self.policy, max_path_length=3)
    assert path['observations'].shape[0] == 3
    assert path['actions'].shape[0] == 3
    assert path['rewards'].shape[0] == 3
    agent_info = [
        path['agent_infos'][k]
        for k in self.policy.distribution.dist_info_keys
    ]
    assert agent_info[0].shape[0] == 3
    # dummy is the env_info_key
    assert path['env_infos']['dummy'].shape[0] == 3
def init_plot(self, env, policy):
    """Initialize the plotter.

    Args:
        env (GarageEnv): Environment to visualize.
        policy (garage.np.policies.Policy): Policy to roll out in the
            visualization.

    """
    if not Plotter.enable:
        return
    if not (self._process and self._queue):
        self._init_worker()

    # Needed in order to draw glfw window on the main thread
    if 'Darwin' in platform.platform():
        rollout(env,
                policy,
                max_episode_length=np.inf,
                animated=True,
                speedup=5)

    self._queue.put(Message(op=Op.UPDATE, args=(env, policy), kwargs=None))
def test_snapshot(self):
    with LocalRunner() as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo, env)
        runner.train(n_epochs=self.verifyItrs, batch_size=4000)

        env.close()

    # Read snapshot from self.log_dir
    # Test the presence and integrity of policy and env
    for i in range(self.verifyItrs):
        self.reset_tf()
        with LocalRunner():
            snapshot = joblib.load(
                osp.join(self.log_dir.name, 'itr_{}.pkl'.format(i)))

            env = snapshot['env']
            algo = snapshot['algo']
            assert env
            assert algo
            assert algo.policy

            rollout(env, algo.policy, animated=False)
def _obtain_evaluation_samples(self, env, num_trajs=100,
                               max_path_length=1000):
    r"""Sample the policy for num_trajs trajectories and return average values.

    Args:
        env (garage.envs.GarageEnv): The environment used to obtain
            trajectories.
        num_trajs (int): Number of trajectories.
        max_path_length (int): Number of maximum steps in one batch.

    Returns:
        dict: Evaluation trajectories, representing the best current
            performance of the algorithm, with keys:
                * env_spec (garage.envs.EnvSpec): Specification for the
                    environment from which this data was sampled.
                * observations (numpy.ndarray): A numpy array containing
                    the observations for all time steps in this batch.
                * actions (numpy.ndarray): A numpy array containing the
                    actions for all time steps in this batch.
                * rewards (numpy.ndarray): A numpy array containing the
                    rewards for all time steps in this batch.
                * terminals (numpy.ndarray): A boolean numpy array
                    containing the termination signals for all time steps
                    in this batch.
                * env_infos (dict): A dict of numpy arrays containing
                    arbitrary environment state information.
                * agent_infos (dict): A dict of numpy arrays containing
                    arbitrary agent state information.
                * lengths (numpy.ndarray): An integer numpy array containing
                    the length of each trajectory in this batch.
                * discount (float): Discount value.

    """
    paths = []
    for _ in range(num_trajs):
        path = rollout(env,
                       self.policy,
                       max_path_length=max_path_length,
                       deterministic=True)
        paths.append(path)

    obs = [path['observations'] for path in paths]
    actions = [path['actions'] for path in paths]
    rewards = [path['rewards'] for path in paths]
    agent_infos = [path['agent_infos'] for path in paths]
    env_infos = [path['env_infos'] for path in paths]
    terminals = [path['dones'] for path in paths]
    lengths = [len(path['rewards']) for path in paths]

    return dict(env_spec=self.env_spec,
                observations=obs,
                actions=actions,
                rewards=rewards,
                terminals=terminals,
                env_infos=env_infos,
                agent_infos=agent_infos,
                lengths=lengths,
                discount=self.discount)
def test_deterministic_action(self):
    path = utils.rollout(self.env,
                         self.policy,
                         max_path_length=5,
                         deterministic=True)
    assert (path['actions'] == 0.).all()
def test_does_flatten(self):
    path = utils.rollout(self.env, self.policy, max_path_length=5)
    assert path['observations'][0].shape == (16, )
    assert path['actions'][0].shape == (2, 2)
def _worker_collect_one_path(g, max_path_length, scope=None):
    g = _get_scoped_g(g, scope)
    path = rollout(g.env, g.policy, max_path_length)
    # Return the path plus the number of samples it contributes, so the
    # caller can track progress toward a sample-based stopping criterion.
    return path, len(path["rewards"])
def _worker_collect_one_path_on_traj(g, max_path_length, scope=None):
    g = _get_scoped_g(g, scope)
    path = rollout(g.env, g.policy, max_path_length)
    # Count one trajectory, so the caller can track progress toward a
    # trajectory-based stopping criterion.
    return path, 1
                        help='use the mean action or stochastic action',
                        action='store_true')
    args = parser.parse_args()
    print(args)

    # If the snapshot file uses tensorflow, do:
    # import tensorflow as tf
    # with tf.compat.v1.Session():
    #     [rest of the code]
    with tf.compat.v1.Session() as sess:
        data = joblib.load(args.file)
        policy = data['algo'].policy
        env = data['env']
        while True:
            path = rollout(env,
                           policy,
                           max_path_length=args.max_path_length,
                           animated=True,
                           speedup=args.speedup,
                           deterministic=args.deterministic)
            plt.figure()
            plt.title('observations')
            plt.xlabel('time steps')
            plt.plot(range(args.max_path_length), path['observations'])

            plt.figure()
            plt.title('actions')
            plt.xlabel('time steps')
            plt.plot(range(args.max_path_length), path['actions'])

            plt.figure()
            plt.title('rewards')
"(or 'y' or 'n').\n") if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('file', type=str, help='path to the snapshot file') parser.add_argument('--max_episode_length', type=int, default=1000, help='Max length of episode') parser.add_argument('--speedup', type=float, default=1, help='Speedup') args = parser.parse_args() # If the snapshot file use tensorflow, do: # import tensorflow as tf # with tf.compat.v1.Session(): # [rest of the code] with tf.compat.v1.Session() as sess: data = cloudpickle.load(args.file) policy = data['algo'].policy env = data['env'] while True: path = rollout(env, policy, max_episode_length=args.max_episode_length, animated=True, speedup=args.speedup) if not query_yes_no('Continue simulation?'): break