def run_metarl(env, envs, tasks, seed, log_dir):
    """Create metarl TensorFlow RL2-PPO model and training.

    Args:
        env (MetaRLEnv): Environment whose spec is used to build the policy
            and algorithm.
        envs (list): Training environments, one per sampled task.
        tasks (TaskSampler): Sampler over the ML45 training tasks.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file.

    """
    deterministic.set_seed(seed)
    snapshot_config = SnapshotConfig(snapshot_dir=log_dir,
                                     snapshot_mode='gap',
                                     snapshot_gap=10)
    with LocalTFRunner(snapshot_config) as runner:
        policy = GaussianGRUPolicy(
            hidden_dims=hyper_parameters['hidden_sizes'],
            env_spec=env.spec,
            state_include_action=False)

        baseline = MetaRLLinearFeatureBaseline(env_spec=env.spec)

        inner_algo = RL2PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=hyper_parameters['max_path_length'] *
            hyper_parameters['rollout_per_task'],
            discount=hyper_parameters['discount'],
            gae_lambda=hyper_parameters['gae_lambda'],
            lr_clip_range=hyper_parameters['lr_clip_range'],
            optimizer_args=dict(
                max_epochs=hyper_parameters['optimizer_max_epochs'],
                tf_optimizer_args=dict(
                    learning_rate=hyper_parameters['optimizer_lr'],
                ),
            ))

        # Task names only need to be passed explicitly when
        # meta_batch_size < number of tasks.
        task_names = list(ML45_ENVS['train'].keys())

        algo = RL2(policy=policy,
                   inner_algo=inner_algo,
                   max_path_length=hyper_parameters['max_path_length'],
                   meta_batch_size=hyper_parameters['meta_batch_size'],
                   task_sampler=tasks,
                   task_names=None
                   if hyper_parameters['meta_batch_size'] >= len(task_names)
                   else task_names)

        # Set up the logger since we are not using run_experiment.
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        text_log_file = osp.join(log_dir, 'debug.log')
        dowel_logger.add_output(dowel.TextOutput(text_log_file))
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(
            algo,
            envs,
            sampler_cls=hyper_parameters['sampler_cls'],
            n_workers=hyper_parameters['meta_batch_size'],
            worker_class=RL2Worker,
            sampler_args=dict(
                use_all_workers=hyper_parameters['use_all_workers']),
            worker_args=dict(
                n_paths_per_trial=hyper_parameters['rollout_per_task']))

        # Meta evaluator: build the held-out ML45 test environments, padding
        # observations to the largest observation dimension among them.
        env_obs_dim = [env().observation_space.shape[0]
                       for (_, env) in ML45_ENVS['test'].items()]
        max_obs_dim = max(env_obs_dim)
        ML_test_envs = [
            TaskIdWrapper(NormalizedRewardEnv(
                RL2Env(env(*ML45_ARGS['test'][task]['args'],
                           **ML45_ARGS['test'][task]['kwargs']),
                       max_obs_dim)),
                          task_id=task_id,
                          task_name=task)
            for (task_id, (task, env)) in enumerate(ML45_ENVS['test'].items())
        ]

        test_tasks = task_sampler.EnvPoolSampler(ML_test_envs)
        test_tasks.grow_pool(hyper_parameters['n_test_tasks'])
        test_task_names = list(ML45_ENVS['test'].keys())

        runner.setup_meta_evaluator(
            test_task_sampler=test_tasks,
            n_exploration_traj=hyper_parameters['rollout_per_task'],
            n_test_rollouts=hyper_parameters['test_rollout_per_task'],
            n_test_tasks=hyper_parameters['n_test_tasks'],
            n_workers=hyper_parameters['n_test_tasks'],
            test_task_names=None
            if hyper_parameters['n_test_tasks'] >= len(test_task_names)
            else test_task_names)

        runner.train(n_epochs=hyper_parameters['n_itr'],
                     batch_size=hyper_parameters['meta_batch_size'] *
                     hyper_parameters['rollout_per_task'] *
                     hyper_parameters['max_path_length'])

        dowel_logger.remove_all()

        return tabular_log_file
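# Usage sketch (illustrative, not part of the benchmark script): the caller
# is expected to construct the ML45 training environments and task sampler
# and pass them in. `make_ml45_train_envs` is a hypothetical helper and the
# log directory is an arbitrary example path.
#
# envs = make_ml45_train_envs()              # list of wrapped training envs
# tasks = task_sampler.EnvPoolSampler(envs)  # sampler over those envs
# csv_path = run_metarl(envs[0], envs, tasks, seed=1,
#                       log_dir='data/local/rl2-ppo-ml45/trial_1')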
def run_metarl(env, test_env, seed, log_dir):
    """Create metarl PEARL model and training.

    Args:
        env (list): Environment constructors for the training tasks,
            instantiated as env[i]().
        test_env (list): Environment constructors for the test tasks.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file.

    """
    deterministic.set_seed(seed)
    snapshot_config = SnapshotConfig(snapshot_dir=log_dir,
                                     snapshot_mode='gap',
                                     snapshot_gap=10)
    runner = LocalRunner(snapshot_config)

    obs_dim = int(np.prod(env[0]().observation_space.shape))
    action_dim = int(np.prod(env[0]().action_space.shape))
    reward_dim = 1

    # Instantiate networks.
    encoder_in_dim = obs_dim + action_dim + reward_dim
    encoder_out_dim = params['latent_size'] * 2
    net_size = params['net_size']

    context_encoder = MLPEncoder(input_dim=encoder_in_dim,
                                 output_dim=encoder_out_dim,
                                 hidden_sizes=[200, 200, 200])

    # The Q-functions see the observation augmented with the latent context.
    space_a = akro.Box(low=-1,
                       high=1,
                       shape=(obs_dim + params['latent_size'], ),
                       dtype=np.float32)
    space_b = akro.Box(low=-1, high=1, shape=(action_dim, ), dtype=np.float32)
    augmented_env = EnvSpec(space_a, space_b)

    qf1 = ContinuousMLPQFunction(env_spec=augmented_env,
                                 hidden_sizes=[net_size, net_size, net_size])
    qf2 = ContinuousMLPQFunction(env_spec=augmented_env,
                                 hidden_sizes=[net_size, net_size, net_size])

    obs_space = akro.Box(low=-1, high=1, shape=(obs_dim, ), dtype=np.float32)
    action_space = akro.Box(low=-1,
                            high=1,
                            shape=(params['latent_size'], ),
                            dtype=np.float32)
    vf_env = EnvSpec(obs_space, action_space)

    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    policy = TanhGaussianMLPPolicy2(
        env_spec=augmented_env,
        hidden_sizes=[net_size, net_size, net_size])

    context_conditioned_policy = ContextConditionedPolicy(
        latent_dim=params['latent_size'],
        context_encoder=context_encoder,
        policy=policy,
        use_ib=params['use_information_bottleneck'],
        use_next_obs=params['use_next_obs_in_context'],
    )

    train_task_names = ML10.get_train_tasks()._task_names
    test_task_names = ML10.get_test_tasks()._task_names

    pearlsac = PEARLSAC(
        env=env,
        test_env=test_env,
        policy=context_conditioned_policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        num_train_tasks=params['num_train_tasks'],
        num_test_tasks=params['num_test_tasks'],
        latent_dim=params['latent_size'],
        meta_batch_size=params['meta_batch_size'],
        num_steps_per_epoch=params['num_steps_per_epoch'],
        num_initial_steps=params['num_initial_steps'],
        num_tasks_sample=params['num_tasks_sample'],
        num_steps_prior=params['num_steps_prior'],
        num_extra_rl_steps_posterior=params['num_extra_rl_steps_posterior'],
        num_evals=params['num_evals'],
        num_steps_per_eval=params['num_steps_per_eval'],
        batch_size=params['batch_size'],
        embedding_batch_size=params['embedding_batch_size'],
        embedding_mini_batch_size=params['embedding_mini_batch_size'],
        max_path_length=params['max_path_length'],
        reward_scale=params['reward_scale'],
        train_task_names=train_task_names,
        test_task_names=test_task_names,
    )

    tu.set_gpu_mode(params['use_gpu'], gpu_id=0)
    if params['use_gpu']:
        pearlsac.to()

    # Set up the logger since we are not using run_experiment.
    tabular_log_file = osp.join(log_dir, 'progress.csv')
    tensorboard_log_dir = osp.join(log_dir)
    dowel_logger.add_output(dowel.StdOutput())
    dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
    dowel_logger.add_output(dowel.TensorBoardOutput(tensorboard_log_dir))

    runner.setup(algo=pearlsac,
                 env=env,
                 sampler_cls=PEARLSampler,
                 sampler_args=dict(max_path_length=params['max_path_length']))
    runner.train(n_epochs=params['num_epochs'],
                 batch_size=params['batch_size'])

    dowel_logger.remove_all()

    return tabular_log_file
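# Usage sketch (illustrative): this variant expects lists of zero-argument
# environment constructors for the ML10 train and test tasks. The names
# `train_env_ctors` and `test_env_ctors` are hypothetical placeholders.
#
# csv_path = run_metarl(train_env_ctors, test_env_ctors, seed=1,
#                       log_dir='data/local/pearl-ml10/trial_1')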
import gc
import os

from dowel import logger
import tensorflow as tf

from metarl.experiment import deterministic
from metarl.experiment.snapshotter import SnapshotConfig
from tests.fixtures.logger import NullOutput

path = os.path.join(os.getcwd(), 'data/local/experiment')
snapshot_config = SnapshotConfig(snapshot_dir=path,
                                 snapshot_mode='last',
                                 snapshot_gap=1)


class TfTestCase:

    def setup_method(self):
        self.sess = tf.compat.v1.Session()
        self.sess.__enter__()

    def teardown_method(self):
        self.sess.__exit__(None, None, None)
        self.sess.close()
        del self.sess
        gc.collect()


class TfGraphTestCase:

    def setup_method(self):
        self.graph = tf.Graph()
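# Example fixture usage (illustrative): pytest invokes setup_method and
# teardown_method around each test, so a test class that needs a live TF
# session only has to inherit from TfTestCase. The test below is a made-up
# placeholder, not part of the fixture module.
#
# class TestAddition(TfTestCase):
#     def test_constant_sum(self):
#         total = self.sess.run(tf.constant(41) + tf.constant(1))
#         assert total == 42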
def run_metarl(env, seed, log_dir):
    """Create metarl TensorFlow PPO model and training.

    Args:
        env (dict): Environment of the task.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file

    """
    deterministic.set_seed(seed)
    snapshot_config = SnapshotConfig(snapshot_dir=log_dir,
                                     snapshot_mode='gap',
                                     snapshot_gap=10)
    with LocalTFRunner(snapshot_config) as runner:
        env, task_samplers = _prepare_meta_env(env)

        policy = GaussianGRUPolicy(
            hidden_dims=hyper_parameters['hidden_sizes'],
            env_spec=env.spec,
            state_include_action=False)

        baseline = MetaRLLinearFeatureBaseline(env_spec=env.spec)

        inner_algo = RL2PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=hyper_parameters['max_path_length'] *
            hyper_parameters['rollout_per_task'],
            discount=hyper_parameters['discount'],
            gae_lambda=hyper_parameters['gae_lambda'],
            lr_clip_range=hyper_parameters['lr_clip_range'],
            optimizer_args=dict(
                max_epochs=hyper_parameters['optimizer_max_epochs'],
                tf_optimizer_args=dict(
                    learning_rate=hyper_parameters['optimizer_lr'],
                ),
            ))

        algo = RL2(policy=policy,
                   inner_algo=inner_algo,
                   max_path_length=hyper_parameters['max_path_length'],
                   meta_batch_size=hyper_parameters['meta_batch_size'],
                   task_sampler=task_samplers)

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        text_log_file = osp.join(log_dir, 'debug.log')
        dowel_logger.add_output(dowel.TextOutput(text_log_file))
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(
            algo,
            task_samplers.sample(hyper_parameters['meta_batch_size']),
            sampler_cls=hyper_parameters['sampler_cls'],
            n_workers=hyper_parameters['meta_batch_size'],
            worker_class=RL2Worker,
            sampler_args=dict(
                use_all_workers=hyper_parameters['use_all_workers']),
            worker_args=dict(
                n_paths_per_trial=hyper_parameters['rollout_per_task']))

        runner.setup_meta_evaluator(
            test_task_sampler=task_samplers,
            n_exploration_traj=hyper_parameters['rollout_per_task'],
            n_test_rollouts=hyper_parameters['test_rollout_per_task'],
            n_test_tasks=hyper_parameters['n_test_tasks'],
            n_workers=hyper_parameters['n_test_tasks'])

        runner.train(n_epochs=hyper_parameters['n_itr'],
                     batch_size=hyper_parameters['meta_batch_size'] *
                     hyper_parameters['rollout_per_task'] *
                     hyper_parameters['max_path_length'])

        dowel_logger.remove_all()

        return tabular_log_file
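# Usage sketch (illustrative): unlike the first variant above, which takes
# pre-built envs and a task sampler, this function builds both through
# _prepare_meta_env, so the caller passes only the base environment spec.
# The `meta_env` argument and log path below are hypothetical.
#
# csv_path = run_metarl(meta_env, seed=1,
#                       log_dir='data/local/rl2-ppo/trial_1')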