def __init__(self, meta_batch_size, *args, **kwargs):
    self.quick_init(locals())  # store init arguments for serialization
    self.meta_batch_size = meta_batch_size

    self.pre_update_action_var = None
    self.pre_update_mean_var = None
    self.pre_update_log_std_var = None

    self.post_update_action_var = None
    self.post_update_mean_var = None
    self.post_update_log_std_var = None

    GaussianMLPPolicy.__init__(self, *args, **kwargs)
    # MetaPolicy.__init__(self, *args, **kwargs)  # super does not call MetaPolicy.__init__()

    self._pre_update_mode = True
def main(config):
    baseline = LinearFeatureBaseline()
    env = normalize(HopperRandParamsEnv())
    obs_dim = np.prod(env.observation_space.shape)

    policy = GaussianMLPPolicy(
        name="meta-policy",
        obs_dim=obs_dim,
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    sampler = MAMLSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],  # This batch_size is confusing
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
        envs_per_task=5,
    )

    sample_processor = SingleSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
        positive_adv=config['positive_adv'],
    )

    algo = PPO(
        policy=policy,
        learning_rate=config['learning_rate'],
        max_epochs=config['max_epochs'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
    )

    trainer.train()
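# For reference, a hypothetical config dict covering exactly the keys that main() reads
# could look like the sketch below. Every value is an illustrative placeholder (an
# assumption), not a setting taken from this experiment.
if __name__ == '__main__':
    example_config = {
        'meta_batch_size': 20,
        'hidden_sizes': (64, 64),
        'rollouts_per_meta_task': 2,
        'max_path_length': 200,
        'parallel': False,
        'discount': 0.99,
        'gae_lambda': 1.0,
        'normalize_adv': True,
        'positive_adv': False,
        'learning_rate': 1e-3,
        'max_epochs': 5,
        'n_itr': 1000,
    }
    main(example_config)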
def run_experiment(**kwargs):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)

    with sess.as_default() as sess:
        exp_dir = os.getcwd() + '/data/' + EXP_NAME + '/' + kwargs.get('exp_name', '')
        logger.configure(dir=exp_dir,
                         format_strs=['stdout', 'log', 'csv'],
                         snapshot_mode='last')
        json.dump(kwargs,
                  open(exp_dir + '/params.json', 'w'),
                  indent=2,
                  sort_keys=True,
                  cls=ClassEncoder)

        # Instantiate classes
        set_seed(kwargs['seed'])

        env = normalize(kwargs['env']())  # Wrappers?

        baseline = NNValueFun(
            'value-function',
            env,
            hidden_nonlinearity=kwargs['vfun_hidden_nonlinearity'],
            hidden_sizes=kwargs['vfun_hidden_sizes'],
            output_nonlinearity=kwargs['vfun_output_nonlinearity'],
            learning_rate=kwargs['vfun_learning_rate'],
            batch_size=kwargs['vfun_batch_size'],
            buffer_size=kwargs['vfun_buffer_size'],
            normalize_input=False,
        )

        policy = GaussianMLPPolicy(
            name="policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            hidden_sizes=kwargs['policy_hidden_sizes'],
            learn_std=kwargs['policy_learn_std'],
            output_nonlinearity=kwargs['policy_output_nonlinearity'],
        )

        dynamics_model = MLPDynamicsModel(
            'prob-dynamics',
            env=env,
            hidden_nonlinearity=kwargs['dyanmics_hidden_nonlinearity'],
            hidden_sizes=kwargs['dynamics_hidden_sizes'],
            output_nonlinearity=kwargs['dyanmics_output_nonlinearity'],
            learning_rate=kwargs['dynamics_learning_rate'],
            batch_size=kwargs['dynamics_batch_size'],
            buffer_size=kwargs['dynamics_buffer_size'],
            normalize_input=False,
        )

        assert kwargs['num_rollouts'] % kwargs['n_parallel'] == 0

        sampler = Sampler(
            env=env,
            policy=policy,
            num_rollouts=kwargs['num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            n_parallel=kwargs['n_parallel'],
        )

        sample_processor = ModelSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        algo = SVG1(
            policy=policy,
            dynamics_model=dynamics_model,
            value_function=baseline,
            tf_reward=env.tf_reward,
            learning_rate=kwargs['svg_learning_rate'],
            num_grad_steps=kwargs['num_rollouts'] * kwargs['max_path_length'] // kwargs['svg_batch_size'],
            batch_size=kwargs['svg_batch_size'],
            discount=kwargs['discount'],
            kl_penalty=kwargs['kl_penalty'],
        )

        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            sampler=sampler,
            sample_processor=sample_processor,
            dynamics_model=dynamics_model,
            value_function=baseline,
            n_itr=kwargs['n_itr'],
            dynamics_model_max_epochs=kwargs['dynamics_max_epochs'],
            vfun_max_epochs=kwargs['vfun_max_epochs'],
            sess=sess,
        )

        trainer.train()
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/parallel_mb_ppo/' + EXP_NAME + '/' + kwargs.get('exp_name', '')
    logger.configure(dir=exp_dir,
                     format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last')
    json.dump(kwargs,
              open(exp_dir + '/params.json', 'w'),
              indent=2,
              sort_keys=True,
              cls=ClassEncoder)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)

    with sess.as_default() as sess:
        # Instantiate classes
        set_seed(kwargs['seed'])

        baseline = kwargs['baseline']()

        env = normalize(kwargs['env']())  # Wrappers?

        policy = GaussianMLPPolicy(
            name="meta-policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            hidden_sizes=kwargs['policy_hidden_sizes'],
            learn_std=kwargs['policy_learn_std'],
            hidden_nonlinearity=kwargs['policy_hidden_nonlinearity'],
            output_nonlinearity=kwargs['policy_output_nonlinearity'],
        )

        dynamics_model = MLPDynamicsEnsemble(
            'dynamics-ensemble',
            env=env,
            num_models=kwargs['num_models'],
            hidden_nonlinearity=kwargs['dyanmics_hidden_nonlinearity'],
            hidden_sizes=kwargs['dynamics_hidden_sizes'],
            output_nonlinearity=kwargs['dyanmics_output_nonlinearity'],
            learning_rate=kwargs['dynamics_learning_rate'],
            batch_size=kwargs['dynamics_batch_size'],
            buffer_size=kwargs['dynamics_buffer_size'],
        )

        env_sampler = Sampler(
            env=env,
            policy=policy,
            num_rollouts=kwargs['num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            n_parallel=kwargs['n_parallel'],
        )

        model_sampler = METRPOSampler(
            env=env,
            policy=policy,
            num_rollouts=kwargs['imagined_num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            dynamics_model=dynamics_model,
            deterministic=kwargs['deterministic'],
        )

        dynamics_sample_processor = ModelSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        model_sample_processor = SampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        algo = PPO(
            policy=policy,
            learning_rate=kwargs['learning_rate'],
            clip_eps=kwargs['clip_eps'],
            max_epochs=kwargs['num_ppo_steps'],
        )

        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            model_sampler=model_sampler,
            env_sampler=env_sampler,
            model_sample_processor=model_sample_processor,
            dynamics_sample_processor=dynamics_sample_processor,
            dynamics_model=dynamics_model,
            n_itr=kwargs['n_itr'],
            dynamics_model_max_epochs=kwargs['dynamics_max_epochs'],
            log_real_performance=kwargs['log_real_performance'],
            steps_per_iter=kwargs['steps_per_iter'],
            sample_from_buffer=True,
            sess=sess,
        )

        trainer.train()
def run_experiment(**kwargs):
    print()
    exp_dir = os.getcwd() + '/data/parallel_mb_ppo/' + EXP_NAME + '/' + kwargs.get('exp_name', '')
    print("\n---------- experiment with dir {} ---------------------------".format(exp_dir))
    logger.configure(dir=exp_dir,
                     format_strs=['csv', 'stdout', 'log'],
                     snapshot_mode='last')
    json.dump(kwargs,
              open(exp_dir + '/params.json', 'w'),
              indent=2,
              sort_keys=True,
              cls=ClassEncoder)

    config = ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)

    # Instantiate classes
    set_seed(kwargs['seed'])

    baseline = kwargs['baseline']()

    env = normalize(kwargs['env']())  # Wrappers?

    policy = GaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        hidden_sizes=kwargs['policy_hidden_sizes'],
        learn_std=kwargs['policy_learn_std'],
        hidden_nonlinearity=kwargs['policy_hidden_nonlinearity'],
        output_nonlinearity=kwargs['policy_output_nonlinearity'],
    )

    dynamics_model = MLPDynamicsEnsemble(
        'dynamics-ensemble',
        env=env,
        num_models=kwargs['num_models'],
        hidden_nonlinearity=kwargs['dyanmics_hidden_nonlinearity'],
        hidden_sizes=kwargs['dynamics_hidden_sizes'],
        output_nonlinearity=kwargs['dyanmics_output_nonlinearity'],
        learning_rate=kwargs['dynamics_learning_rate'],
        batch_size=kwargs['dynamics_batch_size'],
        buffer_size=kwargs['dynamics_buffer_size'],
    )

    '''-------- dumps and reloads -----------------'''

    baseline_pickle = pickle.dumps(baseline)
    env_pickle = pickle.dumps(env)

    receiver, sender = Pipe()
    p = Process(
        target=init_vars,
        name="init_vars",
        args=(sender, config, policy, dynamics_model),
        daemon=False,
    )
    p.start()
    policy_pickle, dynamics_model_pickle = receiver.recv()
    receiver.close()

    '''-------- following classes depend on baseline, env, policy, dynamics_model -----------'''

    worker_data_feed_dict = {
        'env_sampler': {
            'num_rollouts': kwargs['num_rollouts'],
            'max_path_length': kwargs['max_path_length'],
            'n_parallel': kwargs['n_parallel'],
        },
        'dynamics_sample_processor': {
            'discount': kwargs['discount'],
            'gae_lambda': kwargs['gae_lambda'],
            'normalize_adv': kwargs['normalize_adv'],
            'positive_adv': kwargs['positive_adv'],
        },
    }

    worker_model_feed_dict = {}

    worker_policy_feed_dict = {
        'model_sampler': {
            'num_rollouts': kwargs['imagined_num_rollouts'],
            'max_path_length': kwargs['max_path_length'],
            'dynamics_model': dynamics_model,
            'deterministic': kwargs['deterministic'],
        },
        'model_sample_processor': {
            'discount': kwargs['discount'],
            'gae_lambda': kwargs['gae_lambda'],
            'normalize_adv': kwargs['normalize_adv'],
            'positive_adv': kwargs['positive_adv'],
        },
        'algo': {
            'learning_rate': kwargs['learning_rate'],
            'clip_eps': kwargs['clip_eps'],
            'max_epochs': kwargs['num_ppo_steps'],
        },
    }

    trainer = ParallelTrainer(
        policy_pickle=policy_pickle,
        env_pickle=env_pickle,
        baseline_pickle=baseline_pickle,
        dynamics_model_pickle=dynamics_model_pickle,
        feed_dicts=[worker_data_feed_dict, worker_model_feed_dict, worker_policy_feed_dict],
        n_itr=kwargs['n_itr'],
        dynamics_model_max_epochs=kwargs['dynamics_max_epochs'],
        log_real_performance=kwargs['log_real_performance'],
        steps_per_iter=kwargs['steps_per_iter'],
        flags_need_query=kwargs['flags_need_query'],
        config=config,
        simulation_sleep=kwargs['simulation_sleep'],
    )

    trainer.train()
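# The parallel launchers above (and run_base below) hand `policy` and `dynamics_model` to a
# child process via `init_vars`, which is referenced but not defined in this snippet. The
# sketch below is an assumption about what such a helper could look like -- it only builds a
# session, initializes the TF variables, and sends pickled copies back through the pipe. It
# is not the repository's actual implementation.
def init_vars(sender, config, policy, dynamics_model):
    sess = tf.Session(config=config)
    with sess.as_default():
        sess.run(tf.global_variables_initializer())
        # Pickle inside the child so the parent process never has to own a TF graph/session.
        sender.send((pickle.dumps(policy), pickle.dumps(dynamics_model)))
    sender.close()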
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/parallel_mb_ppo/' + EXP_NAME + '/' + kwargs.get('exp_name', '')
    logger.configure(dir=exp_dir,
                     format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last')
    json.dump(kwargs,
              open(exp_dir + '/params.json', 'w'),
              indent=2,
              sort_keys=True,
              cls=ClassEncoder)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)

    with sess.as_default() as sess:
        # Instantiate classes
        set_seed(kwargs['seed'])

        baseline = kwargs['baseline']()

        env = normalize(kwargs['env']())

        policy = GaussianMLPPolicy(
            name="policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            hidden_sizes=kwargs['hidden_sizes'],
            learn_std=kwargs['learn_std'],
            hidden_nonlinearity=kwargs['hidden_nonlinearity'],
            output_nonlinearity=kwargs['output_nonlinearity'],
            init_std=kwargs['init_std'],
        )

        # Load policy here

        sampler = Sampler(
            env=env,
            policy=policy,
            num_rollouts=kwargs['num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            n_parallel=kwargs['n_parallel'],
        )

        sample_processor = SingleSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        algo = PPO(
            policy=policy,
            learning_rate=kwargs['learning_rate'],
            clip_eps=kwargs['clip_eps'],
            max_epochs=kwargs['num_ppo_steps'],
        )

        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            sampler=sampler,
            sample_processor=sample_processor,
            n_itr=kwargs['n_itr'],
            sess=sess,
        )

        trainer.train()
def run_base(exp_dir, **kwargs):
    config = ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)

    # Instantiate classes
    set_seed(kwargs['seed'])

    baseline = kwargs['baseline']()

    if kwargs['env'] == 'Ant':
        env = normalize(AntEnv())
        simulation_sleep = 0.05 * kwargs['num_rollouts'] * kwargs['max_path_length'] * kwargs['simulation_sleep_frac']
    elif kwargs['env'] == 'HalfCheetah':
        env = normalize(HalfCheetahEnv())
        simulation_sleep = 0.05 * kwargs['num_rollouts'] * kwargs['max_path_length'] * kwargs['simulation_sleep_frac']
    elif kwargs['env'] == 'Hopper':
        env = normalize(HopperEnv())
        simulation_sleep = 0.008 * kwargs['num_rollouts'] * kwargs['max_path_length'] * kwargs['simulation_sleep_frac']
    elif kwargs['env'] == 'Walker2d':
        env = normalize(Walker2dEnv())
        simulation_sleep = 0.008 * kwargs['num_rollouts'] * kwargs['max_path_length'] * kwargs['simulation_sleep_frac']
    else:
        raise NotImplementedError

    policy = GaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        hidden_sizes=kwargs['policy_hidden_sizes'],
        learn_std=kwargs['policy_learn_std'],
        hidden_nonlinearity=kwargs['policy_hidden_nonlinearity'],
        output_nonlinearity=kwargs['policy_output_nonlinearity'],
    )

    dynamics_model = MLPDynamicsEnsemble(
        'dynamics-ensemble',
        env=env,
        num_models=kwargs['num_models'],
        hidden_nonlinearity=kwargs['dyanmics_hidden_nonlinearity'],
        hidden_sizes=kwargs['dynamics_hidden_sizes'],
        output_nonlinearity=kwargs['dyanmics_output_nonlinearity'],
        learning_rate=kwargs['dynamics_learning_rate'],
        batch_size=kwargs['dynamics_batch_size'],
        buffer_size=kwargs['dynamics_buffer_size'],
        rolling_average_persitency=kwargs['rolling_average_persitency'],
    )

    '''-------- dumps and reloads -----------------'''

    baseline_pickle = pickle.dumps(baseline)
    env_pickle = pickle.dumps(env)

    receiver, sender = Pipe()
    p = Process(
        target=init_vars,
        name="init_vars",
        args=(sender, config, policy, dynamics_model),
        daemon=True,
    )
    p.start()
    policy_pickle, dynamics_model_pickle = receiver.recv()
    receiver.close()

    '''-------- following classes depend on baseline, env, policy, dynamics_model -----------'''

    worker_data_feed_dict = {
        'env_sampler': {
            'num_rollouts': kwargs['num_rollouts'],
            'max_path_length': kwargs['max_path_length'],
            'n_parallel': kwargs['n_parallel'],
        },
        'dynamics_sample_processor': {
            'discount': kwargs['discount'],
            'gae_lambda': kwargs['gae_lambda'],
            'normalize_adv': kwargs['normalize_adv'],
            'positive_adv': kwargs['positive_adv'],
        },
    }

    worker_model_feed_dict = {}

    worker_policy_feed_dict = {
        'model_sampler': {
            'num_rollouts': kwargs['imagined_num_rollouts'],
            'max_path_length': kwargs['max_path_length'],
            'deterministic': kwargs['deterministic'],
        },
        'model_sample_processor': {
            'discount': kwargs['discount'],
            'gae_lambda': kwargs['gae_lambda'],
            'normalize_adv': kwargs['normalize_adv'],
            'positive_adv': kwargs['positive_adv'],
        },
        'algo': {
            'learning_rate': kwargs['learning_rate'],
            'clip_eps': kwargs['clip_eps'],
            'max_epochs': kwargs['num_ppo_steps'],
        },
    }

    trainer = ParallelTrainer(
        exp_dir=exp_dir,
        algo_str=kwargs['algo'],
        policy_pickle=policy_pickle,
        env_pickle=env_pickle,
        baseline_pickle=baseline_pickle,
        dynamics_model_pickle=dynamics_model_pickle,
        feed_dicts=[worker_data_feed_dict, worker_model_feed_dict, worker_policy_feed_dict],
        n_itr=kwargs['n_itr'],
        flags_need_query=kwargs['flags_need_query'],
        config=config,
        simulation_sleep=simulation_sleep,
        sampler_str=kwargs['sampler'],
    )

    trainer.train()
class TestPolicy(unittest.TestCase):
    def setUp(self):
        sess = tf.get_default_session()
        if sess is None:
            tf.InteractiveSession()

    def test_output_sym(self):
        with tf.Session() as sess:
            obs_dim = 23
            action_dim = 7
            self.env = DummyEnv(obs_dim, action_dim)
            self.policy = GaussianMLPPolicy(obs_dim,
                                            action_dim,
                                            name='test_policy_output_sym',
                                            hidden_sizes=(64, 64))

            obs_ph_1 = tf.placeholder(dtype=tf.float32,
                                      name="obs_ph_1",
                                      shape=(None,) + self.env.observation_space.shape)

            output_sym_1 = self.policy.distribution_info_sym(obs_ph_1)

            sess.run(tf.global_variables_initializer())

            n_obs = self.env.get_obs(n=100)
            action, agent_infos = self.policy.get_actions(n_obs)
            agent_infos_output_sym = sess.run(output_sym_1,
                                              feed_dict={obs_ph_1: n_obs})

            for k in agent_infos.keys():
                self.assertTrue(
                    np.allclose(agent_infos[k],
                                agent_infos_output_sym[k],
                                rtol=1e-5,
                                atol=1e-5))

    def test_get_action(self):
        with tf.Session() as sess:
            obs_dim = 23
            action_dim = 7
            self.env = DummyEnv(obs_dim, action_dim)
            self.policy = GaussianMLPPolicy(obs_dim,
                                            action_dim,
                                            name='test_policy_get_action',
                                            hidden_sizes=(64, 64))

            sess.run(tf.global_variables_initializer())

            obs = self.env.get_obs()
            action, agent_infos = self.policy.get_action(obs)
            actions, agents_infos = self.policy.get_actions(
                np.expand_dims(obs, 0))

            for k in agent_infos.keys():
                self.assertTrue(
                    np.allclose(agent_infos[k],
                                agents_infos[k],
                                rtol=1e-5,
                                atol=1e-5))

    def testSerialize1(self):
        obs_dim = 23
        action_dim = 7
        self.env = DummyEnv(obs_dim, action_dim)
        self.policy = GaussianMLPPolicy(obs_dim,
                                        action_dim,
                                        name='test_policy_serialize',
                                        hidden_sizes=(64, 64))

        sess = tf.get_default_session()
        sess.run(tf.global_variables_initializer())

        all_param_values = self.policy.get_param_values()
        self.policy.set_params(all_param_values)

    def testSerialize2(self):
        obs_dim = 2
        action_dim = 7
        env = DummyEnv(obs_dim, action_dim)
        policy = GaussianMLPPolicy(obs_dim,
                                   action_dim,
                                   name='test_policy_serialize2',
                                   hidden_sizes=(54, 23))

        sess = tf.get_default_session()
        sess.run(tf.global_variables_initializer())

        obs = env.get_obs()
        _, pre_agent_infos = policy.get_action(obs)
        pkl_str = pickle.dumps(policy)
        tf.reset_default_graph()

        with tf.Session() as sess:
            policy_unpickled = pickle.loads(pkl_str)
            _, post_agent_infos = policy_unpickled.get_action(obs)
            for key in pre_agent_infos.keys():
                self.assertTrue(
                    np.allclose(pre_agent_infos[key], post_agent_infos[key]))
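# If this test class lives in its own module, the standard unittest entry point below makes
# it runnable directly (a minimal sketch; the surrounding imports are assumed to be in place).
if __name__ == '__main__':
    unittest.main()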