def task_fn(): env = make('Acrobot-v1') name = 'demo_exp' env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space) mlp_q = MLPQValueFunction(env_spec=env_spec, name_scope=name + '_mlp_q', name=name + '_mlp_q', mlp_config=[ { "ACT": "TANH", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 64, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 }, { "ACT": "TANH", "B_INIT_VALUE": 0.0, "NAME": "2", "N_UNITS": 64, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 }, { "ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "3", "N_UNITS": 256, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 }, { "ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "N_UNITS": 1, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 } ]) dqn = DQN(env_spec=env_spec, config_or_config_dict=dict(REPLAY_BUFFER_SIZE=50000, GAMMA=0.99, BATCH_SIZE=32, LEARNING_RATE=0.001, TRAIN_ITERATION=1, DECAY=0), name=name + '_dqn', value_func=mlp_q) agent = Agent(env=env, env_spec=env_spec, algo=dqn, name=name + '_agent', exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space, prob_scheduler=LinearScheduler( t_fn=lambda: get_global_status_collect()( 'TOTAL_AGENT_TRAIN_SAMPLE_COUNT'), schedule_timesteps=int(0.1 * 100000), initial_p=1.0, final_p=0.02), init_random_prob=0.1), noise_adder=None) flow = create_train_test_flow( test_every_sample_count=100, train_every_sample_count=1, start_test_after_sample_count=0, start_train_after_sample_count=1000, sample_func_and_args=(agent.sample, (), dict(sample_count=1, env=agent.env, store_flag=True)), train_func_and_args=(agent.train, (), dict()), test_func_and_args=(agent.test, (), dict(sample_count=3)), ) experiment = Experiment( tuner=None, env=env, agent=agent, flow=flow, name=name ) experiment.run()
def task_fn(): env = make('Acrobot-v1') name = 'example_scheduler_' env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space) mlp_q = MLPQValueFunction(env_spec=env_spec, name_scope=name + '_mlp_q', name=name + '_mlp_q', mlp_config=[{ "ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 }, { "ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "N_UNITS": 1, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 }]) dqn = DQN(env_spec=env_spec, config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000, GAMMA=0.99, BATCH_SIZE=10, LEARNING_RATE=0.001, TRAIN_ITERATION=1, DECAY=0.5), name=name + '_dqn', value_func=mlp_q) agent = Agent(env=env, env_spec=env_spec, algo=dqn, name=name + '_agent', algo_saving_scheduler=PeriodicalEventSchedule( t_fn=lambda: get_global_status_collect() ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'), trigger_every_step=20, after_t=10), exploration_strategy=EpsilonGreedy( action_space=env_spec.action_space, prob_scheduler=PiecewiseScheduler( t_fn=lambda: get_global_status_collect() ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'), endpoints=((10, 0.3), (100, 0.1), (200, 0.0)), outside_value=0.0), init_random_prob=0.5)) flow = create_train_test_flow(test_every_sample_count=10, train_every_sample_count=10, start_test_after_sample_count=5, start_train_after_sample_count=5, train_func_and_args=(agent.train, (), dict()), test_func_and_args=(agent.test, (), dict(sample_count=10)), sample_func_and_args=(agent.sample, (), dict(sample_count=100, env=agent.env, store_flag=True))) experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name + 'experiment_debug') dqn.parameters.set_scheduler( param_key='LEARNING_RATE', scheduler=LinearScheduler( t_fn=experiment.TOTAL_AGENT_TRAIN_SAMPLE_COUNT, schedule_timesteps=GlobalConfig( ).DEFAULT_EXPERIMENT_END_POINT['TOTAL_AGENT_TRAIN_SAMPLE_COUNT'], final_p=0.0001, initial_p=0.01)) experiment.run()
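# For reference: the LinearScheduler and PiecewiseScheduler used above anneal the
# exploration probability and the learning rate as training samples accumulate.
# A minimal, self-contained sketch of the same scheduling logic (illustrative
# helper functions, not Baconian's API):
def linear_schedule(t, schedule_timesteps, initial_p, final_p):
    """Interpolate linearly from initial_p to final_p over schedule_timesteps steps."""
    fraction = min(float(t) / schedule_timesteps, 1.0)
    return initial_p + fraction * (final_p - initial_p)


def piecewise_schedule(t, endpoints, outside_value):
    """Interpolate linearly between (t, value) endpoints; return outside_value beyond them."""
    for (l_t, l_v), (r_t, r_v) in zip(endpoints[:-1], endpoints[1:]):
        if l_t <= t < r_t:
            alpha = float(t - l_t) / (r_t - l_t)
            return l_v + alpha * (r_v - l_v)
    return outside_value


# e.g. linear_schedule(5000, schedule_timesteps=10000, initial_p=1.0, final_p=0.02) -> 0.51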
def task_fn(): env = make('Pendulum-v0') name = 'demo_exp' env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space) mlp_q = MLPQValueFunction(env_spec=env_spec, name_scope=name + '_mlp_q', name=name + '_mlp_q', mlp_config=[{ "ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 }, { "ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "N_UNITS": 1, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 }]) policy = DeterministicMLPPolicy(env_spec=env_spec, name_scope=name + '_mlp_policy', name=name + '_mlp_policy', mlp_config=[{ "ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 }, { "ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "N_UNITS": env_spec.flat_action_dim, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 }], reuse=False) ddpg = DDPG(env_spec=env_spec, config_or_config_dict={ "REPLAY_BUFFER_SIZE": 10000, "GAMMA": 0.999, "Q_NET_L1_NORM_SCALE": 0.01, "Q_NET_L2_NORM_SCALE": 0.01, "CRITIC_LEARNING_RATE": 0.001, "ACTOR_LEARNING_RATE": 0.001, "DECAY": 0.5, "BATCH_SIZE": 50, "TRAIN_ITERATION": 1, "critic_clip_norm": 0.1, "actor_clip_norm": 0.1, }, value_func=mlp_q, policy=policy, name=name + '_ddpg', replay_buffer=None) agent = Agent( env=env, env_spec=env_spec, algo=ddpg, algo_saving_scheduler=PeriodicalEventSchedule( t_fn=lambda: get_global_status_collect() ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'), trigger_every_step=20, after_t=10), name=name + '_agent', exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space, init_random_prob=0.5)) flow = TrainTestFlow( train_sample_count_func=lambda: get_global_status_collect() ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'), config_or_config_dict={ "TEST_EVERY_SAMPLE_COUNT": 10, "TRAIN_EVERY_SAMPLE_COUNT": 10, "START_TRAIN_AFTER_SAMPLE_COUNT": 5, "START_TEST_AFTER_SAMPLE_COUNT": 5, }, func_dict={ 'test': { 'func': agent.test, 'args': list(), 'kwargs': dict(sample_count=10), }, 'train': { 'func': agent.train, 'args': list(), 'kwargs': dict(), }, 'sample': { 'func': agent.sample, 'args': list(), 'kwargs': dict(sample_count=100, env=agent.env, in_which_status='TRAIN', store_flag=True), }, }) experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name) experiment.run()
def pendulum_task_fn(): exp_config = PENDULUM_BENCHMARK_CONFIG_DICT GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT', exp_config['DEFAULT_EXPERIMENT_END_POINT']) env = make('Pendulum-v0') name = 'benchmark' env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space) mlp_dyna = ContinuousMLPGlobalDynamicsModel(env_spec=env_spec, name_scope=name + '_mlp_dyna', name=name + '_mlp_dyna', **exp_config['DynamicsModel']) dyna_env = DynamicsEnvWrapper(mlp_dyna) dyna_env.set_terminal_reward_func( terminal_func=FixedEpisodeLengthTerminalFunc( max_step_length=env.unwrapped._max_episode_steps, step_count_fn=dyna_env.total_step_count_fn), reward_func=REWARD_FUNC_DICT['Pendulum-v0']()) policy = iLQRPolicy(env_spec=env_spec, **exp_config['ILQR'], dynamics=dyna_env, cost_fn=RewardFuncCostWrapper( reward_func=REWARD_FUNC_DICT['Pendulum-v0']())) algo = iLQRAlogWrapper(policy=policy, env_spec=env_spec, dynamics_env=dyna_env) agent = Agent(env=env, env_spec=env_spec, algo=algo, exploration_strategy=None, noise_adder=None, name=name + '_agent') flow = DynaFlow( train_sample_count_func=lambda: get_global_status_collect() ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'), config_or_config_dict=exp_config['DynaFlow'], func_dict={ 'train_dynamics': { 'func': agent.train, 'args': list(), 'kwargs': dict(state='state_dynamics_training') }, 'train_algo': None, 'test_algo': { 'func': agent.test, 'args': list(), 'kwargs': dict(sample_count=1) }, 'test_dynamics': { 'func': agent.algo.test_dynamics, 'args': list(), 'kwargs': dict(sample_count=100, env=env) }, 'sample_from_real_env': { 'func': agent.sample, 'args': list(), 'kwargs': dict(sample_count=10, env=agent.env, in_which_status='TRAIN', store_flag=True) }, 'sample_from_dynamics_env': None, 'train_algo_from_synthesized_data': None }) experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name) experiment.run()
def task_fn(): env = make('Pendulum-v0') name = 'demo_exp_' env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space) mlp_v = MLPVValueFunc(env_spec=env_spec, name_scope=name + 'mlp_v', name=name + 'mlp_v', mlp_config=[ { "ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 16, "L1_NORM": 0.01, "L2_NORM": 0.01, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 }, { "ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "N_UNITS": 1, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 } ]) policy = NormalDistributionMLPPolicy(env_spec=env_spec, name_scope=name + 'mlp_policy', name=name + 'mlp_policy', mlp_config=[ { "ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "L1_NORM": 0.01, "L2_NORM": 0.01, "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 }, { "ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "N_UNITS": env_spec.flat_action_dim, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 } ], reuse=False) ppo = PPO( env_spec=env_spec, config_or_config_dict={ "gamma": 0.995, "lam": 0.98, "policy_train_iter": 10, "value_func_train_iter": 10, "clipping_range": None, "beta": 1.0, "eta": 50, "value_func_memory_size": 10, "log_var_init": -1.0, "kl_target": 0.003, "policy_lr": 0.01, "value_func_lr": 0.01, "value_func_train_batch_size": 10, "lr_multiplier": 1.0 }, value_func=mlp_v, stochastic_policy=policy, name=name + 'ppo' ) agent = Agent(env=env, env_spec=env_spec, algo=ppo, algo_saving_scheduler=PeriodicalEventSchedule( t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'), trigger_every_step=20, after_t=10), name=name + 'agent', exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space, init_random_prob=0.5)) flow = create_train_test_flow( test_every_sample_count=10, train_every_sample_count=10, start_test_after_sample_count=5, start_train_after_sample_count=5, train_func_and_args=(agent.train, (), dict()), test_func_and_args=(agent.test, (), dict(sample_count=10)), sample_func_and_args=(agent.sample, (), dict(sample_count=100, env=agent.env, sample_type='trajectory', store_flag=True)) ) experiment = Experiment( tuner=None, env=env, agent=agent, flow=flow, name=name ) experiment.run()
def test_integration_with_dqn(self): env = make('Acrobot-v1') env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space) mlp_q = MLPQValueFunction(env_spec=env_spec, name='mlp_q', name_scope='mlp_q', mlp_config=[{ "ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 }, { "ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "N_UNITS": 1, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 }]) dqn = DQN(env_spec=env_spec, name='dqn_test', config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000, GAMMA=0.99, BATCH_SIZE=10, LEARNING_RATE=0.001, TRAIN_ITERATION=1, DECAY=0.5), value_func=mlp_q) dqn.init() st = env.reset() from baconian.common.sampler.sample_data import TransitionData a = TransitionData(env_spec) res = [] for i in range(100): ac = dqn.predict(obs=st, sess=self.sess, batch_flag=False) st_new, re, done, _ = env.step(action=ac) a.append(state=st, new_state=st_new, action=ac, done=done, reward=re) dqn.append_to_memory(a) res.append( dqn.train(batch_data=a, train_iter=10, sess=None, update_target=True)['average_loss']) res.append( dqn.train(batch_data=None, train_iter=10, sess=None, update_target=True)['average_loss']) self.assertTrue(dqn in dqn.recorder._obj_log) self.assertTrue('average_loss' in dqn.recorder._obj_log[dqn]) self.assertTrue(len(dqn.recorder._obj_log[dqn]['average_loss']) == 2) self.assertTrue( np.equal(np.array(res), [ x['log_val'] for x in dqn.recorder._obj_log[dqn]['average_loss'] ]).all()) self.assertTrue(len(Logger()._registered_recorders) > 0) self.assertTrue(dqn.recorder in Logger()._registered_recorders) Logger().flush_recorder()
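# Reference sketch of the standard one-step DQN target that the average_loss above
# is trained toward (plain numpy, illustrative; not the library's update code):
import numpy as np


def dqn_targets(rewards, dones, next_q_values, gamma=0.99):
    # next_q_values: array of shape (batch, n_actions) produced by the target network
    max_next_q = next_q_values.max(axis=1)
    return rewards + gamma * (1.0 - dones.astype(np.float32)) * max_next_q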
def test_mlp_norm_dist_policy(self): env = make('Pendulum-v0') env.reset() env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space) policy = NormalDistributionMLPPolicy(env_spec=env_spec, name='mlp_policy', name_scope='mlp_policy', mlp_config=[{ "ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 }, { "ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "N_UNITS": env_spec.flat_action_dim, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 }], output_high=None, output_low=None, output_norm=None, input_norm=None, reuse=False) policy.init() dist_info = policy.get_dist_info() self.assertTrue( np.equal(dist_info[0]['shape'], policy.mean_output.shape.as_list()).all()) self.assertTrue( np.equal(dist_info[1]['shape'], policy.logvar_output.shape.as_list()).all()) for _ in range(10): ac = policy.forward(obs=env.observation_space.sample()) self.assertTrue(env.action_space.contains(ac[0])) p2 = policy.make_copy(name='test', name_scope='mlp_policy_2', reuse=False) p2.init() self.assertGreater(len(policy.parameters('tf_var_list')), 0) self.assertGreater(len(p2.parameters('tf_var_list')), 0) for var1, var2 in zip(policy.parameters('tf_var_list'), p2.parameters('tf_var_list')): self.assertEqual(var1.shape, var2.shape) self.assertNotEqual(id(var1), id(var2)) p3 = policy.make_copy(name='mlp_policy_ttt', name_scope='mlp_policy', reuse=True) p3.init() self.assertGreater(len(p3.parameters('tf_var_list')), 0) for var1, var2 in zip(policy.parameters('tf_var_list'), p3.parameters('tf_var_list')): self.assertEqual(var1.shape, var2.shape) self.assertEqual(id(var1), id(var2)) # policy.copy_from(p2)] res_not_true = [] for var1, var2, var3 in zip(policy.parameters('tf_var_list'), p2.parameters('tf_var_list'), p3.parameters('tf_var_list')): re1, re2, re3 = self.sess.run([var1, var2, var3]) res_not_true.append(np.isclose(re1, re2).all()) res_not_true.append(np.isclose(re3, re2).all()) self.assertTrue(np.isclose(re1, re3).all()) self.assertFalse(np.array(res_not_true).all()) policy.copy_from(p2) for var1, var2, var3 in zip(policy.parameters('tf_var_list'), p2.parameters('tf_var_list'), p3.parameters('tf_var_list')): re1, re2, re3 = self.sess.run([var1, var2, var3]) self.assertTrue(np.isclose(re1, re3).all()) self.assertTrue(np.isclose(re2, re3).all()) self.assertTrue(np.isclose(re1, re2).all())
def create_ddpg(self, env_id='Pendulum-v0', name='ddpg'): env = make(env_id) env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space) mlp_q = MLPQValueFunction(env_spec=env_spec, name_scope=name + 'mlp_q', name=name + 'mlp_q', mlp_config=[{ "ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 }, { "ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "N_UNITS": 1, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 }]) self.assertTrue(len(mlp_q.parameters('tf_var_list')) == 4) policy = DeterministicMLPPolicy(env_spec=env_spec, name_scope=name + 'mlp_policy', name=name + 'mlp_policy', mlp_config=[{ "ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 }, { "ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "N_UNITS": env_spec.flat_action_dim, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 }], reuse=False) self.assertTrue(len(policy.parameters('tf_var_list')) == 4) ddpg = DDPG(env_spec=env_spec, config_or_config_dict={ "REPLAY_BUFFER_SIZE": 10000, "GAMMA": 0.999, "CRITIC_LEARNING_RATE": 0.001, "ACTOR_LEARNING_RATE": 0.001, "DECAY": 0.5, "BATCH_SIZE": 50, "TRAIN_ITERATION": 1, "critic_clip_norm": 0.1, "actor_clip_norm": 0.1, }, value_func=mlp_q, policy=policy, name=name, replay_buffer=None) return ddpg, locals()
def create_env(self, env_id):
    return make(env_id)
def test_transition_data(self): env = make('Acrobot-v1') env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space) a = TransitionData(env_spec) st = env.reset() for i in range(100): ac = env_spec.action_space.sample() st_new, re, done, _ = env.step(action=ac) a.append(state=st, new_state=st_new, action=ac, done=done, reward=re) self.assertEqual(a.reward_set.shape[0], 100) self.assertEqual(a.done_set.shape[0], 100) self.assertEqual(a.action_set.shape[0], 100) self.assertEqual(a.state_set.shape[0], 100) self.assertEqual(a.new_state_set.shape[0], 100) self.assertEqual(a('reward_set').shape[0], 100) self.assertEqual(a('done_set').shape[0], 100) self.assertEqual(a('state_set').shape[0], 100) self.assertEqual(a('new_state_set').shape[0], 100) self.assertEqual(a('action_set').shape[0], 100) iterator = a.return_generator() count = 0 for st, new_st, ac, reward, terminal in iterator: count += 1 self.assertTrue(env_spec.action_space.contains(ac)) self.assertTrue(env_spec.obs_space.contains(st)) self.assertTrue(env_spec.obs_space.contains(new_st)) self.assertTrue(np.isscalar(reward)) self.assertTrue(isinstance(terminal, bool)) self.assertEqual(count, 100) a = TransitionData( obs_shape=list(np.array(env_spec.obs_space.sample()).shape), action_shape=list(np.array(env_spec.action_space.sample()).shape)) st = env.reset() for i in range(100): ac = env_spec.action_space.sample() st_new, re, done, _ = env.step(action=ac) a.append(state=st, new_state=st_new, action=ac, done=done, reward=re) self.assertEqual(a.reward_set.shape[0], 100) self.assertEqual(a.done_set.shape[0], 100) self.assertEqual(a.action_set.shape[0], 100) self.assertEqual(a.state_set.shape[0], 100) self.assertEqual(a.new_state_set.shape[0], 100) self.assertEqual(a('reward_set').shape[0], 100) self.assertEqual(a('done_set').shape[0], 100) self.assertEqual(a('state_set').shape[0], 100) self.assertEqual(a('new_state_set').shape[0], 100) self.assertEqual(a('action_set').shape[0], 100) self.assertTrue( np.equal(a.get_mean_of('state_set'), a.apply_op('state_set', np.mean)).all()) self.assertTrue( np.equal(a.get_sum_of('state_set'), a.apply_op('state_set', np.sum)).all()) self.assertTrue( np.equal(a.get_sum_of('reward_set'), a.apply_op('reward_set', np.sum)).all()) self.assertTrue( np.equal(a.get_sum_of('reward_set'), a.apply_op('reward_set', np.sum)).all()) self.assertTrue( np.equal(a.get_sum_of('action_set'), a.apply_op('action_set', np.sum)).all()) self.assertTrue( np.equal(a.get_sum_of('action_set'), a.apply_op('action_set', np.sum)).all()) self.assertTrue( np.equal(a.apply_op('state_set', np.max, axis=-1), np.max(a('state_set'), axis=-1)).all()) tmp_action = a('action_set').copy() a.apply_transformation(set_name='action_set', func=lambda x: x * 2, direct_apply=False) self.assertTrue(np.equal(tmp_action, a('action_set')).all()) a.apply_transformation(set_name='action_set', func=lambda x: x * 2, direct_apply=True) self.assertTrue(np.equal(tmp_action * 2.0, a('action_set')).all()) try: a.apply_transformation(set_name='action_set', func=lambda _: np.array([1, 2, 3]), direct_apply=True) except TransformationResultedToDifferentShapeError as e: pass else: raise TypeError a.apply_transformation(set_name='action_set', func=lambda x: x // 2, direct_apply=True) self.assertTrue(np.equal(tmp_action, a('action_set')).all()) index = np.arange(len(a._internal_data_dict['state_set'][0])).tolist() b = a.get_copy() a.shuffle(index=list(index)) for i in range(len(index)): for key in a._internal_data_dict.keys(): self.assertTrue( 
np.equal(np.array(a._internal_data_dict[key][0][i]), np.array(b._internal_data_dict[key][0][i])).all()) iterator = a.return_generator() count = 0 for st, new_st, ac, reward, terminal in iterator: count += 1 self.assertTrue(env_spec.action_space.contains(ac)) self.assertTrue(env_spec.obs_space.contains(st)) self.assertTrue(env_spec.obs_space.contains(new_st)) self.assertTrue(np.isscalar(reward)) self.assertTrue(isinstance(terminal, bool)) self.assertEqual(count, 100) count = 0 iter = a.return_generator(batch_size=10) for st, new_st, ac, reward, terminal in iter: self.assertEqual(len(st), 10) self.assertEqual(len(new_st), 10) self.assertEqual(len(ac), 10) self.assertEqual(len(reward), 10) self.assertEqual(len(terminal), 10) count += 1 self.assertEqual(count, 10) count = 0 iter = a.return_generator(batch_size=10, infinite_run=True) for st, new_st, ac, reward, terminal in iter: self.assertEqual(len(st), 10) self.assertEqual(len(new_st), 10) self.assertEqual(len(ac), 10) self.assertEqual(len(reward), 10) self.assertEqual(len(terminal), 10) count += 1 if count > 20: break self.assertGreater(count, 20) a.append_new_set(name='test', data_set=np.ones_like( a._internal_data_dict['state_set'][0]), shape=a._internal_data_dict['state_set'][1]) iter = a.return_generator(batch_size=10, assigned_keys=('state_set', 'new_state_set', 'action_set', 'reward_set', 'done_set', 'test')) count = 0 for st, new_st, ac, reward, terminal, test in iter: self.assertEqual(len(test), 10) count += 1 self.assertEqual(count, 10) a.reset() self.assertEqual(a.reward_set.shape[0], 0) self.assertEqual(a.done_set.shape[0], 0) self.assertEqual(a.action_set.shape[0], 0) self.assertEqual(a.state_set.shape[0], 0) self.assertEqual(a.new_state_set.shape[0], 0) self.assertEqual(a('reward_set').shape[0], 0) self.assertEqual(a('done_set').shape[0], 0) self.assertEqual(a('state_set').shape[0], 0) self.assertEqual(a('new_state_set').shape[0], 0) self.assertEqual(a('action_set').shape[0], 0)
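# Illustrative only: a toy container with the same append/generator pattern that the
# TransitionData tests above exercise; this is not Baconian's TransitionData class.
import numpy as np


class ToyTransitionData(object):
    def __init__(self):
        self.state, self.new_state, self.action, self.reward, self.done = [], [], [], [], []

    def append(self, state, new_state, action, reward, done):
        self.state.append(state)
        self.new_state.append(new_state)
        self.action.append(action)
        self.reward.append(reward)
        self.done.append(done)

    def return_generator(self, batch_size=1):
        # yield consecutive mini-batches of aligned transition arrays
        for start in range(0, len(self.reward), batch_size):
            sl = slice(start, start + batch_size)
            yield (np.asarray(self.state[sl]), np.asarray(self.new_state[sl]),
                   np.asarray(self.action[sl]), np.asarray(self.reward[sl]),
                   np.asarray(self.done[sl]))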
def test_init(self): sess = self.sess env = make('Pendulum-v0') env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space) action_dim = env_spec.flat_action_dim state_dim = env_spec.flat_obs_dim # bs_shape = tf.placeholder(dtype=tf.int8, shape=[]) bs_shape = 4 action_ph = tf.placeholder(dtype=tf.float32, shape=[None, action_dim]) state_ph = tf.placeholder(dtype=tf.float32, shape=[None, state_dim]) mean_old = tf.layers.dense(inputs=state_ph, name='layer1', units=action_dim) mean2 = tf.layers.dense(inputs=state_ph, name='layer2', units=action_dim) # mean1 = tf.get_variable(name='mean1', shape=[bs_shape, action_dim], dtype=tf.float32) var1 = tf.get_variable(name='var1', shape=[action_dim], dtype=tf.float32, initializer=tf.initializers.random_uniform(0.0, 1.0)) # mean2 = tf.get_variable(name='mean2', shape=[bs_shape, action_dim], dtype=tf.float32) var2 = tf.get_variable(name='var2', shape=[action_dim], dtype=tf.float32, initializer=tf.initializers.random_uniform(0.0, 1.0)) # var1 = tf.get_variable('logvars', (10, action_dim), tf.float32, # tf.constant_initializer(0.0)) # var1 = tf.expand_dims(tf.reduce_sum(var1, axis=0), axis=0) # var1 = tf.tile(var1, [bs_shape, 1]) # # var2 = tf.get_variable('logvars2', (10, action_dim), tf.float32, # tf.constant_initializer(0.0)) # var2 = tf.expand_dims(tf.reduce_sum(var2, axis=0), 0) # var2 = tf.tile(var2, [bs_shape, 1]) dist_old = tfp.distributions.MultivariateNormalDiag(mean_old, tf.sqrt(var1), validate_args=True) dis2 = tfp.distributions.MultivariateNormalDiag(mean2, tf.sqrt(var2), validate_args=True) dist_norm1 = tfp.distributions.Normal(mean_old, var1) dist_norm2 = tfp.distributions.Normal(mean2, var2) print(dist_old, dis2) # dis1 = tfp.distributions.Independent(dis1, reinterpreted_batch_ndims=1) # dis2 = tfp.distributions.Independent(dis2, reinterpreted_batch_ndims=1) # op = tf.train.AdamOptimizer(learning_rate=0.1).minimize(tfp.distributions.kl_divergence(dis1, dis2), # var_list=[mean1, var1]) ac = [env_spec.action_space.sample() for _ in range(bs_shape)] ac = make_batch(np.array(ac), original_shape=env_spec.action_shape) state = [env_spec.obs_space.sample() for _ in range(bs_shape)] state = make_batch(np.array(state), original_shape=env_spec.obs_shape) feed_dict = { state_ph: state, action_ph: ac } sess.run(tf.global_variables_initializer()) kl, entropy, logp, log_p_old = kl_entropy_logprob_from_pat_cody(old_mean=mean_old, old_var=var1, mean=mean2, var=var2, feed_dict=feed_dict, sess=sess, action_ph=action_ph, action_dim=action_dim) kl_tfp = sess.run(tf.reduce_mean(tfp.distributions.kl_divergence(dist_old, dis2)), feed_dict=feed_dict) entropy_tfp = sess.run(tf.reduce_mean(dis2.entropy()), feed_dict=feed_dict) log_prob_tfp = sess.run(dis2.log_prob(value=ac), feed_dict=feed_dict) log_p_old_tfp = sess.run(dist_old.log_prob(value=ac), feed_dict=feed_dict) test_log_prob_tfp = dis2.log_prob(ac) + tf.cast(0.5 * np.log(2. * np.pi * action_dim), dtype=tf.float32) test_log_prob_tfp_old = dist_old.log_prob(ac) + tf.cast(0.5 * np.log(2. 
* np.pi * action_dim), dtype=tf.float32) print("ac shape {}".format(ac.shape)) print("a sample from dis1 shape {}".format(sess.run(dist_old.sample(), feed_dict=feed_dict).shape)) print("shape of dis under feeddict {}".format( sess.run([dist_old.batch_shape_tensor(), dist_old.event_shape_tensor()], feed_dict=feed_dict))) # print(sess.run(dis2.log_prob(value=ac)).shape) # print(sess.run(dis1.log_prob(value=ac)).shape) for i in range(bs_shape): feed_dict_i = { state_ph: make_batch(state[i], env_spec.obs_shape), action_ph: make_batch(ac[i], env_spec.action_shape) } print("i dis2 log prob: {}".format(sess.run(dis2.log_prob(value=ac[i]), feed_dict=feed_dict_i))) print("i dis1 log prob: {}".format(sess.run(dist_old.log_prob(value=ac[i]), feed_dict=feed_dict_i))) print(kl, kl_tfp) print(entropy, entropy_tfp) print(logp, log_prob_tfp) print(log_p_old, log_p_old_tfp) print('new log p {}'.format(sess.run(test_log_prob_tfp, feed_dict=feed_dict))) print('new log p old {}'.format(sess.run(test_log_prob_tfp_old, feed_dict=feed_dict))) print('new log p norm {}'.format(sess.run(tf.reduce_sum(dist_norm1.log_prob(ac), axis=1), feed_dict=feed_dict))) print('new log p old norm {}'.format( sess.run(tf.reduce_sum(dist_norm2.log_prob(ac), axis=1), feed_dict=feed_dict))) self.assertTrue(np.isclose(logp, log_prob_tfp).all()) self.assertTrue(np.isclose(log_p_old, log_p_old_tfp).all()) self.assertTrue(np.isclose(kl, kl_tfp).all()) self.assertTrue(np.isclose(entropy, entropy_tfp).all()) kl, entropy, logp, log_p_old = kl_entropy_logprob_from_mvn(old_mean=mean_old, old_var=var1, mean=mean2, var=var2, feed_dict=feed_dict, sess=sess, action_ph=action_ph, action_dim=action_dim) print(kl, entropy, logp, log_p_old) self.assertTrue(np.isclose(logp, log_prob_tfp).all()) self.assertTrue(np.isclose(log_p_old, log_p_old_tfp).all()) self.assertTrue(np.isclose(entropy, entropy_tfp).all()) self.assertTrue(np.isclose(kl, kl_tfp).all())
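# The test above checks hand-derived diagonal-Gaussian quantities against
# tensorflow_probability. For reference, the closed forms in plain numpy
# (illustrative helpers, not the library's implementation):
import numpy as np


def diag_gauss_log_prob(x, mean, var):
    # log N(x; mean, diag(var)), summed over the action dimension
    return -0.5 * (np.sum(np.log(2.0 * np.pi * var))
                   + np.sum((x - mean) ** 2 / var, axis=-1))


def diag_gauss_entropy(var):
    return 0.5 * np.sum(np.log(2.0 * np.pi * np.e * var))


def diag_gauss_kl(mean_old, var_old, mean_new, var_new):
    # KL(N(mean_old, var_old) || N(mean_new, var_new)) for diagonal covariances
    return 0.5 * (np.sum(np.log(var_new) - np.log(var_old))
                  + np.sum(var_old / var_new)
                  + np.sum((mean_new - mean_old) ** 2 / var_new, axis=-1)
                  - mean_old.shape[-1])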
def pendulum_task_fn(): exp_config = PENDULUM_BENCHMARK_CONFIG_DICT GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT', exp_config['DEFAULT_EXPERIMENT_END_POINT']) env = make('Pendulum-v0') name = 'benchmark' env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space) mlp_dyna = ContinuousMLPGlobalDynamicsModel( env_spec=env_spec, name_scope=name + '_mlp_dyna', name=name + '_mlp_dyna', output_low=env_spec.obs_space.low, output_high=env_spec.obs_space.high, **exp_config['DynamicsModel'] ) algo = ModelPredictiveControl( dynamics_model=mlp_dyna, env_spec=env_spec, config_or_config_dict=exp_config['MPC'], name=name + '_mpc', policy=UniformRandomPolicy(env_spec=env_spec, name='uni_policy') ) algo.set_terminal_reward_function_for_dynamics_env(reward_func=REWARD_FUNC_DICT['Pendulum-v0'](), terminal_func=FixedEpisodeLengthTerminalFunc( max_step_length=env.unwrapped._max_episode_steps, step_count_fn=algo.dynamics_env.total_step_count_fn), ) agent = Agent(env=env, env_spec=env_spec, algo=algo, exploration_strategy=None, noise_adder=None, name=name + '_agent') flow = DynaFlow( train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'), config_or_config_dict=exp_config['DynaFlow'], func_dict={ 'train_dynamics': {'func': agent.train, 'args': list(), 'kwargs': dict()}, 'train_algo': None, 'test_algo': {'func': agent.test, 'args': list(), 'kwargs': dict(sample_count=1, sample_trajectory_flag=True)}, 'test_dynamics': {'func': agent.algo.test_dynamics, 'args': list(), 'kwargs': dict(sample_count=100, env=env)}, 'sample_from_real_env': {'func': agent.sample, 'args': list(), 'kwargs': dict(sample_count=10, env=agent.env, in_which_status='TRAIN', store_flag=True)}, 'sample_from_dynamics_env': None, 'train_algo_from_synthesized_data': None } ) experiment = Experiment( tuner=None, env=env, agent=agent, flow=flow, name=name ) experiment.run()
def task_fn(): env = make('Acrobot-v1') name = 'demo_exp' env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space) mlp_q = MLPQValueFunction(env_spec=env_spec, name_scope=name + '_mlp_q', name=name + '_mlp_q', mlp_config=[{ "ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 }, { "ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "N_UNITS": 1, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 }]) dqn = DQN(env_spec=env_spec, config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000, GAMMA=0.99, BATCH_SIZE=10, Q_NET_L1_NORM_SCALE=0.001, Q_NET_L2_NORM_SCALE=0.001, LEARNING_RATE=0.01, TRAIN_ITERATION=1, DECAY=0.5), name=name + '_dqn', value_func=mlp_q) agent = Agent( env=env, env_spec=env_spec, algo=dqn, name=name + '_agent', exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space, init_random_prob=0.5)) flow = TrainTestFlow( train_sample_count_func=lambda: get_global_status_collect() ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'), config_or_config_dict={ "TEST_EVERY_SAMPLE_COUNT": 10, "TRAIN_EVERY_SAMPLE_COUNT": 10, "START_TRAIN_AFTER_SAMPLE_COUNT": 5, "START_TEST_AFTER_SAMPLE_COUNT": 5, }, func_dict={ 'test': { 'func': agent.test, 'args': list(), 'kwargs': dict(sample_count=10), }, 'train': { 'func': agent.train, 'args': list(), 'kwargs': dict(), }, 'sample': { 'func': agent.sample, 'args': list(), 'kwargs': dict(sample_count=100, env=agent.env, in_which_status='TRAIN', store_flag=True), }, }) experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name) experiment.run()
def task_fn():
    # create the gym environment by the make function
    env = make('Pendulum-v0')
    # give your experiment a name, which is used to generate the log path etc.
    name = 'demo_exp'
    # construct the environment specification
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    # construct the neural network to approximate the Q function of DDPG
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[{
                                  "ACT": "RELU",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "1",
                                  "N_UNITS": 16,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }, {
                                  "ACT": "LINEAR",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "OUPTUT",
                                  "N_UNITS": 1,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }])
    # construct the neural network to approximate the policy for DDPG
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    mlp_config=[{
                                        "ACT": "RELU",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "1",
                                        "N_UNITS": 16,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }, {
                                        "ACT": "LINEAR",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "OUPTUT",
                                        "N_UNITS": env_spec.flat_action_dim,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }],
                                    reuse=False)
    # construct the DDPG algorithm
    ddpg = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": 10000,
                    "GAMMA": 0.999,
                    "CRITIC_LEARNING_RATE": 0.001,
                    "ACTOR_LEARNING_RATE": 0.001,
                    "DECAY": 0.5,
                    "BATCH_SIZE": 50,
                    "TRAIN_ITERATION": 1,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=policy,
                name=name + '_ddpg',
                replay_buffer=None)
    # construct a neural-network-based global dynamics model to approximate
    # the state transition of the environment
    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=name + '_mlp_dyna',
        name=name + '_mlp_dyna',
        learning_rate=0.01,
        state_input_scaler=RunningStandardScaler(dims=env_spec.flat_obs_dim),
        action_input_scaler=RunningStandardScaler(dims=env_spec.flat_action_dim),
        output_delta_state_scaler=RunningStandardScaler(dims=env_spec.flat_obs_dim),
        mlp_config=[{
            "ACT": "RELU",
            "B_INIT_VALUE": 0.0,
            "NAME": "1",
            "L1_NORM": 0.0,
            "L2_NORM": 0.0,
            "N_UNITS": 16,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }, {
            "ACT": "LINEAR",
            "B_INIT_VALUE": 0.0,
            "NAME": "OUPTUT",
            "L1_NORM": 0.0,
            "L2_NORM": 0.0,
            "N_UNITS": env_spec.flat_obs_dim,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }])
    # finally, construct the Dyna algorithm with the model-free algorithm DDPG and the NN model
    algo = Dyna(env_spec=env_spec,
                name=name + '_dyna_algo',
                model_free_algo=ddpg,
                dynamics_model=mlp_dyna,
                config_or_config_dict=dict(dynamics_model_train_iter=10,
                                           model_free_algo_train_iter=10))
    # To make the NN-based dynamics model a proper environment that can serve as a
    # sampling source for DDPG, a reward function and a terminal function need to be set.
    # For example purposes only, we use a random reward function and a terminal function
    # with a fixed episode length.
    algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=algo.dynamics_env.total_step_count_fn),
        reward_func=RandomRewardFunc())
    # construct the agent, with an additional exploration strategy if needed
    agent = Agent(env=env, env_spec=env_spec,
                  algo=algo,
                  algo_saving_scheduler=PeriodicalEventSchedule(
                      t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                      trigger_every_step=20,
                      after_t=10),
                  name=name + '_agent',
                  exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                                     init_random_prob=0.5))
    # construct the training flow, called the Dyna flow; it defines how training
    # proceeds and the terminal condition
    flow = create_dyna_flow(
        train_algo_func=(agent.train, (), dict(state='state_agent_training')),
        train_algo_from_synthesized_data_func=(agent.train, (), dict(state='state_agent_training')),
        train_dynamics_func=(agent.train, (), dict(state='state_dynamics_training')),
        test_algo_func=(agent.test, (), dict(sample_count=1)),
        test_dynamics_func=(agent.algo.test_dynamics, (), dict(sample_count=10, env=env)),
        sample_from_real_env_func=(agent.sample, (), dict(sample_count=10,
                                                          env=agent.env,
                                                          store_flag=True)),
        sample_from_dynamics_env_func=(agent.sample, (), dict(sample_count=10,
                                                              env=agent.algo.dynamics_env,
                                                              store_flag=True)),
        train_algo_every_real_sample_count_by_data_from_real_env=40,
        train_algo_every_real_sample_count_by_data_from_dynamics_env=40,
        test_algo_every_real_sample_count=40,
        test_dynamics_every_real_sample_count=40,
        train_dynamics_ever_real_sample_count=20,
        start_train_algo_after_sample_count=1,
        start_train_dynamics_after_sample_count=1,
        start_test_algo_after_sample_count=1,
        start_test_dynamics_after_sample_count=1,
        warm_up_dynamics_samples=1)
    # construct the experiment
    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name + '_exp')
    # run!
    experiment.run()
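# In Baconian's published examples, a task function like the one above is usually
# handed to an experiment runner rather than called directly. A minimal launch
# sketch, assuming the single_exp_runner entry point and the DEFAULT_LOG_PATH
# config key as used in the library's examples:
from baconian.core.experiment_runner import single_exp_runner
from baconian.config.global_config import GlobalConfig

if __name__ == '__main__':
    GlobalConfig().set('DEFAULT_LOG_PATH', './log_path')
    single_exp_runner(task_fn, del_if_log_path_existed=True)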
def task_fn(): env = make('Pendulum-v0') name = 'demo_exp' env_spec = env.env_spec mlp_dyna = ContinuousMLPGlobalDynamicsModel(env_spec=env_spec, name_scope=name + '_mlp_dyna', name=name + '_mlp_dyna', learning_rate=0.01, mlp_config=[{ "ACT": "TANH", "B_INIT_VALUE": 0.0, "NAME": "1", "L1_NORM": 0.0, "L2_NORM": 0.0, "N_UNITS": 128, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 }, { "ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "L1_NORM": 0.0, "L2_NORM": 0.0, "N_UNITS": env_spec.flat_obs_dim, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 }]) algo = ModelPredictiveControl( dynamics_model=mlp_dyna, env_spec=env_spec, config_or_config_dict=dict(SAMPLED_HORIZON=2, SAMPLED_PATH_NUM=5, dynamics_model_train_iter=10), name=name + '_mpc', policy=UniformRandomPolicy(env_spec=env_spec, name='uni_policy')) algo.set_terminal_reward_function_for_dynamics_env( reward_func=RandomRewardFunc(name='reward_func'), terminal_func=RandomTerminalFunc(name='random_terminal'), ) agent = Agent( env=env, env_spec=env_spec, algo=algo, name=name + '_agent', exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space, init_random_prob=0.5)) flow = create_train_test_flow(test_every_sample_count=10, train_every_sample_count=10, start_test_after_sample_count=5, start_train_after_sample_count=5, train_func_and_args=(agent.train, (), dict()), test_func_and_args=(agent.test, (), dict(sample_count=10)), sample_func_and_args=(agent.sample, (), dict(sample_count=100, env=agent.env, store_flag=True))) experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name) experiment.run()
def pendulum_task_fn(): GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT', exp_config['DEFAULT_EXPERIMENT_END_POINT']) env = make('Pendulum-v0') name = 'benchmark' env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space) mlp_q = MLPQValueFunction(env_spec=env_spec, name_scope=name + '_mlp_q', name=name + '_mlp_q', **exp_config['MLPQValueFunction']) policy = DeterministicMLPPolicy(env_spec=env_spec, name_scope=name + '_mlp_policy', name=name + '_mlp_policy', output_low=env_spec.action_space.low, output_high=env_spec.action_space.high, **exp_config['DeterministicMLPPolicy'], reuse=False) ddpg = DDPG(env_spec=env_spec, policy=policy, value_func=mlp_q, name=name + '_ddpg', **exp_config['DDPG']) mlp_dyna = ContinuousMLPGlobalDynamicsModel(env_spec=env_spec, name_scope=name + '_mlp_dyna', name=name + '_mlp_dyna', **exp_config['DynamicsModel']) algo = Dyna(env_spec=env_spec, name=name + '_dyna_algo', model_free_algo=ddpg, dynamics_model=mlp_dyna, config_or_config_dict=dict(dynamics_model_train_iter=10, model_free_algo_train_iter=10)) algo.set_terminal_reward_function_for_dynamics_env( terminal_func=FixedEpisodeLengthTerminalFunc( max_step_length=env.unwrapped._max_episode_steps, step_count_fn=algo.dynamics_env.total_step_count_fn), reward_func=REWARD_FUNC_DICT['Pendulum-v0']()) agent = Agent(env=env, env_spec=env_spec, algo=algo, exploration_strategy=None, noise_adder=AgentActionNoiseWrapper( noise=NormalActionNoise(), noise_weight_scheduler=ConstantSchedule(value=0.3), action_weight_scheduler=ConstantSchedule(value=1.0)), name=name + '_agent') flow = DynaFlow( train_sample_count_func=lambda: get_global_status_collect() ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'), config_or_config_dict=exp_config['DynaFlow'], func_dict={ 'train_algo': { 'func': agent.train, 'args': list(), 'kwargs': dict(state='state_agent_training') }, 'train_algo_from_synthesized_data': { 'func': agent.train, 'args': list(), 'kwargs': dict(state='state_agent_training', train_iter=1) }, 'train_dynamics': { 'func': agent.train, 'args': list(), 'kwargs': dict(state='state_dynamics_training') }, 'test_algo': { 'func': agent.test, 'args': list(), 'kwargs': dict(sample_count=1, sample_trajectory_flag=True) }, 'test_dynamics': { 'func': agent.algo.test_dynamics, 'args': list(), 'kwargs': dict(sample_count=10, env=env) }, 'sample_from_real_env': { 'func': agent.sample, 'args': list(), 'kwargs': dict(sample_count=10, env=agent.env, in_which_status='TRAIN', store_flag=True) }, 'sample_from_dynamics_env': { 'func': agent.sample, 'args': list(), 'kwargs': dict(sample_count=50, sample_type='transition', env=agent.algo.dynamics_env, in_which_status='TRAIN', store_flag=False) } }) experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name) experiment.run()
from baconian.core.core import EnvSpec from baconian.envs.gym_env import make import numpy as np env = make("HalfCheetah-v2") env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space) OBS_DIM = env_spec.flat_obs_dim HID1_SIZE = 400 HID2_SIZE = 300 POLICY_HID_MULTI = 10 ACT_DIM = env_spec.flat_action_dim POLICY_HID1_SIZE = 400 POLICY_HID2_SIZE = 300 CHEETAH_BENCHMARK_CONFIG_DICT = { 'env_id': "HalfCheetah-v2", 'MLP_V': { 'mlp_config': [{ "ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": HID1_SIZE, "TYPE": "DENSE", "W_NORMAL_STDDEV": np.sqrt(1 / OBS_DIM) }, { "ACT": "RELU", "B_INIT_VALUE": 0.0,
def mountiancar_task_fn(): exp_config = MOUNTAIN_CAR_CONTINUOUS_BENCHMARK_CONFIG_DICT GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT', exp_config['DEFAULT_EXPERIMENT_END_POINT']) env = make('MountainCarContinuous-v0') name = 'benchmark' env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space) mlp_q = MLPQValueFunction(env_spec=env_spec, name_scope=name + '_mlp_q', name=name + '_mlp_q', **exp_config['MLPQValueFunction']) policy = DeterministicMLPPolicy(env_spec=env_spec, name_scope=name + '_mlp_policy', name=name + '_mlp_policy', output_low=env_spec.action_space.low, output_high=env_spec.action_space.high, **exp_config['DeterministicMLPPolicy'], reuse=False) ddpg = DDPG(env_spec=env_spec, policy=policy, value_func=mlp_q, name=name + '_ddpg', **exp_config['DDPG']) n_actions = env.action_space.shape[0] agent = Agent(env=env, env_spec=env_spec, algo=ddpg, exploration_strategy=None, noise_adder=AgentActionNoiseWrapper( noise=OrnsteinUhlenbeckActionNoise( mu=np.zeros(n_actions), sigma=0.5 * np.ones(n_actions)), noise_weight_scheduler=ConstantScheduler(value=1), action_weight_scheduler=ConstantScheduler(value=1.0)), reset_noise_every_terminal_state=True, name=name + '_agent') flow = TrainTestFlow( train_sample_count_func=lambda: get_global_status_collect() ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'), config_or_config_dict=exp_config['TrainTestFlow'] ['config_or_config_dict'], func_dict={ 'test': { 'func': agent.test, 'args': list(), 'kwargs': dict(sample_count=exp_config['TrainTestFlow'] ['TEST_SAMPLES_COUNT']), }, 'train': { 'func': agent.train, 'args': list(), 'kwargs': dict(), }, 'sample': { 'func': agent.sample, 'args': list(), 'kwargs': dict(sample_count=exp_config['TrainTestFlow'] ['TRAIN_SAMPLES_COUNT'], env=agent.env, in_which_status='TRAIN', store_flag=True), }, }) experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name) experiment.run()
def test_func(self): env = make('Pendulum-v0') env.reset() env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space) policy = NormalDistributionMLPPolicy(env_spec=env_spec, name='mlp_policy', name_scope='mlp_policy', mlp_config=[{ "ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 }, { "ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "N_UNITS": env_spec.flat_action_dim, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 }], output_high=None, output_low=None, output_norm=None, input_norm=None, reuse=False) policy.init() print( policy.compute_dist_info(name='entropy', feed_dict={ policy.state_input: make_batch( env_spec.obs_space.sample(), original_shape=env_spec.obs_shape) })) print( policy.compute_dist_info(name='prob', value=env_spec.action_space.sample(), feed_dict={ policy.state_input: make_batch( env_spec.obs_space.sample(), original_shape=env_spec.obs_shape) })) new_policy = policy.make_copy( reuse=False, name='new_p', name_scope='mlp_policy_2', ) new_policy.init() for var1, var2 in zip(policy.parameters('tf_var_list'), new_policy.parameters('tf_var_list')): print(var1.name) print(var2.name) self.assertNotEqual(var1.name, var2.name) self.assertNotEqual(id(var1), id(var2)) obs1 = make_batch( env_spec.obs_space.sample(), original_shape=env_spec.obs_shape, ) obs2 = make_batch(env_spec.obs_space.sample(), original_shape=env_spec.obs_shape) kl1 = policy.compute_dist_info(name='kl', other=new_policy, feed_dict={ policy.state_input: obs1, new_policy.state_input: obs2 }) kl2 = self.sess.run(policy.kl(other=new_policy), feed_dict={ policy.state_input: obs1, new_policy.state_input: obs2 }) self.assertTrue(np.isclose(kl1, kl2).all())
def task_fn(): name = 'mpc_ModifiedHalfCheetah' env = make('ModifiedHalfCheetah') env_spec = env.env_spec mlp_dyna = MBMPC_MLPDynamics(env_spec=env.env_spec, name_scope=name + '_mlp_dyna', name=name + '_mlp_dyna', learning_rate=1e-3, mlp_config=[{ "ACT": "TANH", "B_INIT_VALUE": 0.0, "NAME": "1", "L1_NORM": 0.0, "L2_NORM": 0.0, "N_UNITS": 128, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 }, { "ACT": "TANH", "B_INIT_VALUE": 0.0, "NAME": "2", "L1_NORM": 0.0, "L2_NORM": 0.0, "N_UNITS": 64, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 }, { "ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUPTUT", "L1_NORM": 0.0, "L2_NORM": 0.0, "N_UNITS": env_spec.flat_obs_dim, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03 }]) # buffer rl_size = 500 # default 1000 random_size = 500 # default 1000 ### algo horizon = 20 dyna_epoch = 60 ### agent max_step = 500 # default 1000 # TODO: 9.22 should max_step == rl_size == random_size? batch_size = 128 rand_rl_ratio = 0.1 random_trajectory = 1 # TODO: 9.22 Is there situations when tranjectory num must != 1 on_policy_trajectory = 1 on_policy_iter = 10 num_simulated_paths = 50 # default 1000 algo = ModelBasedModelPredictiveControl( dynamics_model=mlp_dyna, env_spec=env_spec, config_or_config_dict=dict(SAMPLED_HORIZON=horizon, SAMPLED_PATH_NUM=num_simulated_paths, dynamics_model_train_iter=dyna_epoch), name=name + '_algo', policy=UniformRandomPolicy(env_spec=env_spec, name='uniform_random')) algo.set_terminal_reward_function_for_dynamics_env( reward_func=MBMPC_HalfCheetah_CostFunc(name='cost_fn'), terminal_func=MBMPC_HalfCheetah_TerminalFunc(name='terminal_fn')) agent = MB_MPC_Agent(name=name + '_agent', env=env, env_spec=env_spec, algo=algo, exploration_strategy=None, algo_saving_scheduler=None) flow = create_train_test_flow(env=env, env_spec=env_spec, rl_size=rl_size, max_step=max_step, batch_size=batch_size, random_size=random_size, rand_rl_ratio=rand_rl_ratio, train_iter=dyna_epoch, on_policy_iter=on_policy_iter, random_trajectory=random_trajectory, on_policy_trajectory=on_policy_trajectory, num_simulated_paths=num_simulated_paths, train_func_and_args=(agent.train, (), dict()), test_func_and_args=(agent.test, (), dict()), sample_func_and_args=(agent.sample, (), dict()), train_every_sample_count=None, test_every_sample_count=None, start_train_after_sample_count=None, start_test_after_sample_count=None, flow_type='MBMPC_TrainFlow') experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name) experiment.run()
def task_fn():
    env = make('Acrobot-v1')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[{
                                  "ACT": "RELU",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "1",
                                  "N_UNITS": 16,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }, {
                                  "ACT": "LINEAR",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "OUPTUT",
                                  "N_UNITS": 1,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }])
    dqn = DQN(env_spec=env_spec,
              config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                         GAMMA=0.99,
                                         BATCH_SIZE=10,
                                         Q_NET_L1_NORM_SCALE=0.001,
                                         Q_NET_L2_NORM_SCALE=0.001,
                                         LEARNING_RATE=0.01,
                                         TRAIN_ITERATION=1,
                                         DECAY=0.5),
              name=name + '_dqn',
              value_func=mlp_q)
    agent = Agent(env=env, env_spec=env_spec,
                  algo=dqn,
                  name=name + '_agent',
                  exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                                     init_random_prob=0.5))
    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (), dict(sample_count=100,
                                                     env=agent.env,
                                                     in_which_status='TRAIN',
                                                     store_flag=True)))
    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
def test_standard_scaler(self): for env in (make('Pendulum-v0'), make('Acrobot-v1'), make('RoboschoolAnt-v1')): for sample_space in (env.observation_space, env.action_space): sample_fn = sample_space.sample dims = sample_space.flat_dim try: # test batch standard scaler standard_scaler = BatchStandardScaler(dims=dims) data_list = [] for i in range(100): data_list.append(sample_fn()) data = standard_scaler.process(np.array(data_list)) self.assertTrue( np.isclose(np.mean(data, axis=0), 0.0).all()) # TODO a theoretical bound should be given self.assertTrue( np.isclose(np.var(data, axis=0), 1.0, atol=0.04).all()) data = standard_scaler.inverse_process(data) self.assertTrue( np.isclose(data, np.array(data_list)).all()) # test running standard scaler standard_scaler = RunningStandardScaler(dims=dims) data_list = [] for i in range(100): data_list.append(sample_fn()) standard_scaler.update_scaler(np.array(data_list)) self.assertEqual(standard_scaler._data_count, 100) data = standard_scaler.process(np.array(data_list)) self.assertTrue( np.isclose(np.mean(data, axis=0), 0.0).all()) # TODO a theoretical bound should be given self.assertTrue( np.isclose(np.var(data, axis=0), 1.0, atol=0.04).all()) # test update function new_data_list = [] for i in range(100): new_data_list.append(sample_fn()) standard_scaler.update_scaler(np.array(new_data_list)) self.assertEqual(standard_scaler._data_count, 200) data_list += new_data_list data = standard_scaler.process(np.array(data_list)) self.assertTrue( np.isclose(np.mean(data, axis=0), 0.0).all()) # TODO a theoretical bound should be given self.assertTrue( np.isclose(np.var(data, axis=0), 1.0, atol=0.04).all()) # test running scaler with given data data_list = [] for i in range(100): data_list.append(sample_fn()) standard_scaler = RunningStandardScaler( dims=dims, init_data=np.array(data_list)) self.assertEqual(standard_scaler._data_count, 100) data = standard_scaler.process(np.array(data_list)) self.assertTrue( np.isclose(np.mean(data, axis=0), 0.0).all()) # TODO a theoretical bound should be given self.assertTrue( np.isclose(np.var(data, axis=0), 1.0, atol=0.04).all()) # test update of running scaler with given data new_data_list = [] for i in range(100): new_data_list.append(sample_fn()) standard_scaler.update_scaler(np.array(new_data_list)) self.assertEqual(standard_scaler._data_count, 200) data_list += new_data_list data = standard_scaler.process(np.array(data_list)) self.assertTrue( np.isclose(np.mean(data, axis=0), 0.0).all()) # TODO a theoretical bound should be given self.assertTrue( np.isclose(np.var(data, axis=0), 1.0, atol=0.04).all()) # test running scaler with given initial mean, var. 
data_list = [] for i in range(100): data_list.append(sample_fn()) standard_scaler = RunningStandardScaler( dims=dims, init_mean=np.mean(data_list, axis=0), init_var=np.var(data_list, axis=0), init_mean_var_data_count=100) self.assertEqual(standard_scaler._data_count, 100) data = standard_scaler.process(np.array(data_list)) self.assertTrue( np.isclose(np.mean(data, axis=0), 0.0).all()) # TODO a theoretical bound should be given self.assertTrue( np.isclose(np.var(data, axis=0), 1.0, atol=0.04).all()) new_data_list = [] for i in range(100): new_data_list.append(sample_fn()) standard_scaler.update_scaler(np.array(new_data_list)) self.assertEqual(standard_scaler._data_count, 200) data_list += new_data_list data = standard_scaler.process(np.array(data_list)) self.assertTrue( np.isclose(np.mean(data, axis=0), 0.0).all()) # TODO a theoretical bound should be given self.assertTrue( np.isclose(np.var(data, axis=0), 1.0, atol=0.04).all()) except ShapeNotCompatibleError as e: from baconian.common.spaces import Box if isinstance(sample_space, Box): raise ValueError else: pass
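# The RunningStandardScaler tests above rely on combining batch statistics
# incrementally. A plain-numpy sketch of one way to merge an existing
# (count, mean, var) with a new data batch (illustrative, not the library code):
import numpy as np


def update_running_stats(count, mean, var, new_data):
    new_count = new_data.shape[0]
    batch_mean = new_data.mean(axis=0)
    batch_var = new_data.var(axis=0)
    total = count + new_count
    delta = batch_mean - mean
    combined_mean = mean + delta * new_count / total
    combined_m2 = var * count + batch_var * new_count + delta ** 2 * count * new_count / total
    return total, combined_mean, combined_m2 / total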
def inverted_pendulum_task_fn(): exp_config = INVERTED_PENDULUM_BENCHMARK_CONFIG_DICT GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT', exp_config['DEFAULT_EXPERIMENT_END_POINT']) env = make('InvertedPendulum-v2') name = 'benchmark' env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space) mlp_v = MLPVValueFunc(env_spec=env_spec, name_scope=name + 'mlp_v', name=name + 'mlp_v', **exp_config['MLP_V']) policy = NormalDistributionMLPPolicy( env_spec=env_spec, name_scope=name + 'mlp_policy', name=name + 'mlp_policy', **exp_config['POLICY'], output_low=env_spec.action_space.low, output_high=env_spec.action_space.high, reuse=False) ppo = PPO(env_spec=env_spec, **exp_config['PPO'], value_func=mlp_v, stochastic_policy=policy, name=name + '_ppo') agent = Agent(env=env, env_spec=env_spec, algo=ppo, exploration_strategy=None, noise_adder=None, name=name + '_agent') flow = TrainTestFlow( train_sample_count_func=lambda: get_global_status_collect() ('TOTAL_AGENT_TRAIN_SAMPLE_FUNC_COUNT'), config_or_config_dict=exp_config['TrainTestFlow'] ['config_or_config_dict'], func_dict={ 'test': { 'func': agent.test, 'args': list(), 'kwargs': dict(sample_count=exp_config['TrainTestFlow'] ['TEST_SAMPLES_COUNT'], sample_trajectory_flag=True), }, 'train': { 'func': agent.train, 'args': list(), 'kwargs': dict(), }, 'sample': { 'func': agent.sample, 'args': list(), 'kwargs': dict(sample_count=exp_config['TrainTestFlow'] ['TRAIN_SAMPLES_COUNT'], env=agent.env, sample_type='trajectory', in_which_status='TRAIN', store_flag=True), }, }) experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name) experiment.run()
def test_min_max(self): for env in (make('Pendulum-v0'), make('Acrobot-v1'), make('RoboschoolAnt-v1')): for sample_space in (env.observation_space, env.action_space): sample_fn = sample_space.sample dims = sample_space.flat_dim try: print("test {} with sample {} dims {}".format( env, sample_fn, dims)) # test batch scaler min_max = BatchMinMaxScaler(dims=dims) data_list = [] for i in range(100): data_list.append(sample_fn()) data = min_max.process(np.array(data_list)) self.assertTrue( np.greater_equal(np.ones(dims), data).all()) self.assertTrue(np.less_equal(np.zeros(dims), data).all()) # test batch scaler with given range min_max = BatchMinMaxScaler( dims=dims, desired_range=(np.ones(dims) * -1.0, np.ones(dims) * 5.0)) data_list = [] for i in range(100): data_list.append(sample_fn()) data = min_max.process(np.array(data_list)) self.assertTrue( np.greater_equal(np.ones(dims) * 5.0, data).all()) self.assertTrue( np.less_equal(np.ones(dims) * -1.0, data).all()) self.assertEqual(np.max(data), 5.0) self.assertEqual(np.min(data), -1.0) data = min_max.inverse_process(data) self.assertTrue( np.isclose(data, np.array(data_list)).all()) # test batch scaler with given range and given initial data data_list = [] for i in range(100): data_list.append(sample_fn()) min_max = RunningMinMaxScaler( dims=dims, desired_range=(np.ones(dims) * -1.0, np.ones(dims) * 5.0), init_data=np.array(data_list)) data = min_max.process(np.array(data_list)) self.assertTrue( np.greater_equal(np.ones(dims) * 5.0, data).all()) self.assertTrue( np.less_equal(np.ones(dims) * -1.0, data).all()) self.assertEqual(np.max(data), 5.0) self.assertEqual(np.min(data), -1.0) # test batch scaler with given range and given initial min and max data_list = [] for i in range(100): data_list.append(sample_fn()) min_max = RunningMinMaxScaler( dims=dims, desired_range=(np.ones(dims) * -1.0, np.ones(dims) * 5.0), init_min=np.min(np.array(data_list), axis=0), init_max=np.max(np.array(data_list), axis=0)) data = min_max.process(np.array(data_list)) self.assertTrue( np.greater_equal(np.ones(dims) * 5.0, data).all()) self.assertTrue( np.less_equal(np.ones(dims) * -1.0, data).all()) self.assertEqual(np.max(data), 5.0) self.assertEqual(np.min(data), -1.0) # test update function by a larger range of data pre_min = np.min(np.array(data_list), axis=0) pre_max = np.max(np.array(data_list), axis=0) data_list = np.array(data_list) * 2.0 min_max.update_scaler(data_list) self.assertTrue( np.equal(pre_min * 2.0, min_max._min).all()) self.assertTrue( np.equal(pre_max * 2.0, min_max._max).all()) except ShapeNotCompatibleError as e: from baconian.common.spaces import Box if isinstance(sample_space, Box): raise ValueError else: pass
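# The min-max scalers above map data into a desired target range. The underlying
# transform is a single affine rescaling (illustrative helper, not the library class):
import numpy as np


def min_max_process(data, data_min, data_max, target_low, target_high):
    scale = (target_high - target_low) / (data_max - data_min)
    return target_low + (np.asarray(data) - data_min) * scale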
def test_transition_data(self):
    env = make('Acrobot-v1')
    env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)

    # Construct the buffer from an EnvSpec and fill it with 100 random transitions.
    a = TransitionData(env_spec)
    st = env.reset()
    for i in range(100):
        ac = env_spec.action_space.sample()
        st_new, re, done, _ = env.step(action=ac)
        a.append(state=st, new_state=st_new, action=ac, done=done, reward=re)
    self.assertEqual(a.reward_set.shape[0], 100)
    self.assertEqual(a.done_set.shape[0], 100)
    self.assertEqual(a.action_set.shape[0], 100)
    self.assertEqual(a.state_set.shape[0], 100)
    self.assertEqual(a.new_state_set.shape[0], 100)
    self.assertEqual(a('reward_set').shape[0], 100)
    self.assertEqual(a('done_set').shape[0], 100)
    self.assertEqual(a('state_set').shape[0], 100)
    self.assertEqual(a('new_state_set').shape[0], 100)
    self.assertEqual(a('action_set').shape[0], 100)

    # The buffer can also be constructed from raw observation/action shapes.
    a = TransitionData(
        obs_shape=list(np.array(env_spec.obs_space.sample()).shape),
        action_shape=list(np.array(env_spec.action_space.sample()).shape))
    st = env.reset()
    for i in range(100):
        ac = env_spec.action_space.sample()
        st_new, re, done, _ = env.step(action=ac)
        a.append(state=st, new_state=st_new, action=ac, done=done, reward=re)
    self.assertEqual(a.reward_set.shape[0], 100)
    self.assertEqual(a.done_set.shape[0], 100)
    self.assertEqual(a.action_set.shape[0], 100)
    self.assertEqual(a.state_set.shape[0], 100)
    self.assertEqual(a.new_state_set.shape[0], 100)
    self.assertEqual(a('reward_set').shape[0], 100)
    self.assertEqual(a('done_set').shape[0], 100)
    self.assertEqual(a('state_set').shape[0], 100)
    self.assertEqual(a('new_state_set').shape[0], 100)
    self.assertEqual(a('action_set').shape[0], 100)

    # Aggregation helpers agree with applying the corresponding numpy op directly.
    self.assertTrue(np.equal(a.get_mean_of('state_set'), a.apply_op('state_set', np.mean)).all())
    self.assertTrue(np.equal(a.get_sum_of('state_set'), a.apply_op('state_set', np.sum)).all())
    self.assertTrue(np.equal(a.get_sum_of('reward_set'), a.apply_op('reward_set', np.sum)).all())
    self.assertTrue(np.equal(a.get_sum_of('action_set'), a.apply_op('action_set', np.sum)).all())
    self.assertTrue(np.equal(a.apply_op('state_set', np.max, axis=-1),
                             np.max(a('state_set'), axis=-1)).all())

    # apply_transformation only mutates the stored data when direct_apply=True.
    tmp_action = a('action_set').copy()
    a.apply_transformation(set_name='action_set', func=lambda x: x * 2, direct_apply=False)
    self.assertTrue(np.equal(tmp_action, a('action_set')).all())
    a.apply_transformation(set_name='action_set', func=lambda x: x * 2, direct_apply=True)
    self.assertTrue(np.equal(tmp_action * 2.0, a('action_set')).all())

    # A transformation that changes the shape of a set must be rejected.
    try:
        a.apply_transformation(set_name='action_set',
                               func=lambda _: np.array([1, 2, 3]),
                               direct_apply=True)
    except TransformationResultedToDifferentShapeError:
        pass
    else:
        raise TypeError

    a.apply_transformation(set_name='action_set', func=lambda x: x // 2, direct_apply=True)
    self.assertTrue(np.equal(tmp_action, a('action_set')).all())

    # Shuffling with the identity index must leave every set unchanged.
    index = np.arange(len(a._internal_data_dict['state_set'][0])).tolist()
    b = a.get_copy()
    a.shuffle(index=list(index))
    for i in range(len(index)):
        for key in a._internal_data_dict.keys():
            self.assertTrue(np.equal(np.array(a._internal_data_dict[key][0][i]),
                                     np.array(b._internal_data_dict[key][0][i])).all())

    a.append_new_set(name='test',
                     data_set=np.ones_like(a._internal_data_dict['state_set'][0]),
                     shape=a._internal_data_dict['state_set'][1])

    # reset() empties every set.
    a.reset()
    self.assertEqual(a.reward_set.shape[0], 0)
    self.assertEqual(a.done_set.shape[0], 0)
    self.assertEqual(a.action_set.shape[0], 0)
    self.assertEqual(a.state_set.shape[0], 0)
    self.assertEqual(a.new_state_set.shape[0], 0)
    self.assertEqual(a('reward_set').shape[0], 0)
    self.assertEqual(a('done_set').shape[0], 0)
    self.assertEqual(a('state_set').shape[0], 0)
    self.assertEqual(a('new_state_set').shape[0], 0)
    self.assertEqual(a('action_set').shape[0], 0)
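The same TransitionData API exercised by this test can be used directly in a script, for example to collect a small batch of transitions and re-center the rewards before handing the data to an algorithm. Below is a minimal sketch that reuses only calls appearing in the test above (append, get_mean_of, apply_transformation); it is an illustration, not part of the library's examples.

from baconian.core.core import EnvSpec
from baconian.envs.gym_env import make
from baconian.common.sampler.sample_data import TransitionData

env = make('Acrobot-v1')
env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)
buffer = TransitionData(env_spec)

st = env.reset()
for _ in range(50):
    ac = env_spec.action_space.sample()
    st_new, re, done, _ = env.step(action=ac)
    buffer.append(state=st, new_state=st_new, action=ac, done=done, reward=re)
    st = st_new if not done else env.reset()

# Shift the stored rewards in place so they are centered around zero.
mean_reward = buffer.get_mean_of('reward_set')
buffer.apply_transformation(set_name='reward_set',
                            func=lambda r: r - mean_reward,
                            direct_apply=True)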
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[
                                  {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
                                  {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUTPUT", "N_UNITS": 1, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}
                              ])
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    mlp_config=[
                                        {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
                                        {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUTPUT", "N_UNITS": env_spec.flat_action_dim, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}
                                    ],
                                    reuse=False)
    ddpg = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": 10000,
                    "GAMMA": 0.999,
                    "Q_NET_L1_NORM_SCALE": 0.01,
                    "Q_NET_L2_NORM_SCALE": 0.01,
                    "CRITIC_LEARNING_RATE": 0.001,
                    "ACTOR_LEARNING_RATE": 0.001,
                    "DECAY": 0.5,
                    "BATCH_SIZE": 50,
                    "TRAIN_ITERATION": 1,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=policy,
                name=name + '_ddpg',
                replay_buffer=None)
    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=name + '_mlp_dyna',
        name=name + '_mlp_dyna',
        output_low=env_spec.obs_space.low,
        output_high=env_spec.obs_space.high,
        learning_rate=0.01,
        mlp_config=[
            {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "L1_NORM": 0.0, "L2_NORM": 0.0, "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUTPUT", "L1_NORM": 0.0, "L2_NORM": 0.0, "N_UNITS": env_spec.flat_obs_dim, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}
        ])
    algo = Dyna(env_spec=env_spec,
                name=name + '_dyna_algo',
                model_free_algo=ddpg,
                dynamics_model=mlp_dyna,
                config_or_config_dict=dict(dynamics_model_train_iter=10,
                                            model_free_algo_train_iter=10))
    # For example purposes only, we use a random reward function and a terminal function
    # with a fixed episode length.
    algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=algo.dynamics_env.total_step_count_fn),
        reward_func=RandomRewardFunc())
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=algo,
                  algo_saving_scheduler=PeriodicalEventSchedule(
                      t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                      trigger_every_step=20,
                      after_t=10),
                  name=name + '_agent',
                  exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                                     init_random_prob=0.5))
    flow = DynaFlow(
        train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict={
            "TRAIN_ALGO_EVERY_REAL_SAMPLE_COUNT_FROM_REAL_ENV": 10,
            "TRAIN_ALGO_EVERY_REAL_SAMPLE_COUNT_FROM_DYNAMICS_ENV": 10,
            "TEST_ALGO_EVERY_REAL_SAMPLE_COUNT": 10,
            "TEST_DYNAMICS_EVERY_REAL_SAMPLE_COUNT": 10,
            "TRAIN_DYNAMICS_EVERY_REAL_SAMPLE_COUNT": 10,
            "START_TRAIN_ALGO_AFTER_SAMPLE_COUNT": 1,
            "START_TRAIN_DYNAMICS_AFTER_SAMPLE_COUNT": 1,
            "START_TEST_ALGO_AFTER_SAMPLE_COUNT": 1,
            "START_TEST_DYNAMICS_AFTER_SAMPLE_COUNT": 1,
            "WARM_UP_DYNAMICS_SAMPLES": 1
        },
        func_dict={
            'train_algo': {'func': agent.train, 'args': list(),
                           'kwargs': dict(state='state_agent_training')},
            'train_algo_from_synthesized_data': {'func': agent.train, 'args': list(),
                                                 'kwargs': dict(state='state_agent_training')},
            'train_dynamics': {'func': agent.train, 'args': list(),
                               'kwargs': dict(state='state_dynamics_training')},
            'test_algo': {'func': agent.test, 'args': list(), 'kwargs': dict(sample_count=10)},
            'test_dynamics': {'func': agent.algo.test_dynamics, 'args': list(),
                              'kwargs': dict(sample_count=10, env=env)},
            'sample_from_real_env': {'func': agent.sample, 'args': list(),
                                     'kwargs': dict(sample_count=10, env=agent.env,
                                                    in_which_status='TRAIN', store_flag=True)},
            'sample_from_dynamics_env': {'func': agent.sample, 'args': list(),
                                         'kwargs': dict(sample_count=10, env=agent.algo.dynamics_env,
                                                        in_which_status='TRAIN', store_flag=True)}
        })
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name + '_exp')
    experiment.run()
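The RandomRewardFunc above is only a placeholder. For Pendulum-v0 the reward is known in closed form, so a real run would normally plug in an analytic reward instead. Below is a hedged sketch of that formula as a plain callable; the function name and the (state, action, new_state) signature are illustrative assumptions, and the actual hook in the library may expect a RewardFunc subclass (see the RandomRewardFunc import for where that base class lives).

import numpy as np

def pendulum_reward(state, action, new_state=None):
    # Hypothetical analytic reward for Pendulum-v0: the observation is
    # [cos(theta), sin(theta), theta_dot] and the Gym reward is
    # -(theta^2 + 0.1 * theta_dot^2 + 0.001 * action^2).
    cos_th, sin_th, th_dot = np.asarray(state)
    theta = np.arctan2(sin_th, cos_th)
    u = float(np.clip(np.asarray(action), -2.0, 2.0).ravel()[0])
    return -(theta ** 2 + 0.1 * th_dot ** 2 + 0.001 * u ** 2)

If the library expects a class rather than a bare callable, the same expression would simply move into that class's call method.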
def pendulum_task_fn():
    exp_config = PENDULUM_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT', exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              **exp_config['MLPQValueFunction'])
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    output_low=env_spec.action_space.low,
                                    output_high=env_spec.action_space.high,
                                    **exp_config['DeterministicMLPPolicy'],
                                    reuse=False)
    ddpg = DDPG(env_spec=env_spec,
                policy=policy,
                value_func=mlp_q,
                name=name + '_ddpg',
                **exp_config['DDPG'])
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=ddpg,
                  exploration_strategy=None,
                  noise_adder=AgentActionNoiseWrapper(
                      noise=NormalActionNoise(),
                      noise_weight_scheduler=ConstantSchedule(value=0.3),
                      action_weight_scheduler=ConstantSchedule(value=1.0)),
                  name=name + '_agent')
    flow = TrainTestFlow(
        train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['TrainTestFlow']['config_or_config_dict'],
        func_dict={
            'test': {'func': agent.test, 'args': list(),
                     'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TEST_SAMPLES_COUNT'],
                                    sample_trajectory_flag=True)},
            'train': {'func': agent.train, 'args': list(), 'kwargs': dict()},
            'sample': {'func': agent.sample, 'args': list(),
                       'kwargs': dict(sample_count=exp_config['TrainTestFlow']['TRAIN_SAMPLES_COUNT'],
                                      env=agent.env, in_which_status='TRAIN', store_flag=True)},
        })
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
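In this benchmark the noise_adder mixes the deterministic policy output with NormalActionNoise using two constant weights: 1.0 on the action and 0.3 on the noise. The standalone sketch below illustrates that weighted-sum idea with plain numpy; the clip-to-bounds step and the helper name perturb_action are illustrative assumptions, not the wrapper's internal code.

import numpy as np

def perturb_action(action, noise_sample, action_low, action_high,
                   action_weight=1.0, noise_weight=0.3):
    # Conceptual sketch: weighted combination of policy action and exploration
    # noise, clipped back into the valid action range.
    noisy = action_weight * np.asarray(action) + noise_weight * np.asarray(noise_sample)
    return np.clip(noisy, action_low, action_high)

# Example for Pendulum-v0, whose action bounds are [-2, 2]:
# perturb_action(action=[0.5], noise_sample=np.random.randn(1),
#                action_low=-2.0, action_high=2.0)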
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[
                                  {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
                                  {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUTPUT", "N_UNITS": 1, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}
                              ])
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    mlp_config=[
                                        {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
                                        {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUTPUT", "N_UNITS": env_spec.flat_action_dim, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}
                                    ],
                                    reuse=False)
    ddpg = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": 10000,
                    "GAMMA": 0.999,
                    "CRITIC_LEARNING_RATE": 0.001,
                    "ACTOR_LEARNING_RATE": 0.001,
                    "DECAY": 0.5,
                    "BATCH_SIZE": 50,
                    "TRAIN_ITERATION": 1,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=policy,
                name=name + '_ddpg',
                replay_buffer=None)

    # Build an ensemble of 10 MLP dynamics models, each with its own input/output scalers.
    mlp_dyna_list = []
    for i in range(10):
        mlp_dyna = ContinuousMLPGlobalDynamicsModel(
            env_spec=env_spec,
            name_scope=name + '_mlp_dyna_{}'.format(i),
            name=name + '_mlp_dyna_{}'.format(i),
            learning_rate=0.01,
            state_input_scaler=RunningStandardScaler(dims=env_spec.flat_obs_dim),
            action_input_scaler=RunningStandardScaler(dims=env_spec.flat_action_dim),
            output_delta_state_scaler=RunningStandardScaler(dims=env_spec.flat_obs_dim),
            mlp_config=[
                {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "L1_NORM": 0.0, "L2_NORM": 0.0, "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
                {"ACT": "LINEAR", "B_INIT_VALUE": 0.0, "NAME": "OUTPUT", "L1_NORM": 0.0, "L2_NORM": 0.0, "N_UNITS": env_spec.flat_obs_dim, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}
            ])
        mlp_dyna_list.append(mlp_dyna)
    dyna_ensemble_model = ModelEnsemble(n_models=10,
                                        model=mlp_dyna_list,
                                        prediction_type='random',
                                        env_spec=env_spec)
    algo = ModelEnsembleAlgo(env_spec=env_spec,
                             model_free_algo=ddpg,
                             dynamics_model=dyna_ensemble_model,
                             config_or_config_dict=dict(dynamics_model_train_iter=10,
                                                        model_free_algo_train_iter=10,
                                                        validation_trajectory_count=2))
    # For example purposes only, we use a terminal function with a fixed episode length;
    # here the analytic Pendulum reward function is used instead of a random one.
    algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=algo.dynamics_env.total_step_count_fn),
        reward_func=PendulumRewardFunc())
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=algo,
                  algo_saving_scheduler=PeriodicalEventSchedule(
                      t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                      trigger_every_step=200,
                      after_t=10),
                  name=name + '_agent',
                  exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                                     init_random_prob=0.5))
    # We can easily reuse the Dyna training flow to implement the model-ensemble training flow.
    flow = create_dyna_flow(
        train_algo_func=(agent.train, (), dict(state='state_agent_training')),
        train_algo_from_synthesized_data_func=(agent.train, (), dict(state='state_agent_training')),
        train_dynamics_func=(agent.train, (), dict(state='state_dynamics_training')),
        test_algo_func=(agent.test, (), dict(sample_count=10)),
        test_dynamics_func=(agent.algo.test_dynamics, (), dict(sample_count=10, env=env)),
        sample_from_real_env_func=(agent.sample, (), dict(sample_count=10, env=agent.env, store_flag=True)),
        sample_from_dynamics_env_func=(agent.sample, (), dict(sample_count=10, env=agent.algo.dynamics_env, store_flag=True)),
        # Set these large enough that the agent only uses data from the dynamics env.
        train_algo_every_real_sample_count_by_data_from_real_env=100,
        train_algo_every_real_sample_count_by_data_from_dynamics_env=100,
        test_algo_every_real_sample_count=100,
        test_dynamics_every_real_sample_count=100,
        train_dynamics_ever_real_sample_count=100,
        start_train_algo_after_sample_count=1,
        start_train_dynamics_after_sample_count=1,
        start_test_algo_after_sample_count=1,
        start_test_dynamics_after_sample_count=1,
        warm_up_dynamics_samples=100)
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name + '_exp')
    experiment.run()
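With prediction_type='random', each one-step query to the ensemble is presumably answered by a single member chosen uniformly at random, which is how model-ensemble methods avoid systematically exploiting any one model's error. The short sketch below is a conceptual illustration of that mechanism in plain Python, not the ModelEnsemble implementation; the predict method name on the member models is assumed.

import numpy as np

class RandomMemberEnsemble(object):
    """Conceptual sketch: hold a list of dynamics models and answer each
    prediction query with one member picked uniformly at random."""

    def __init__(self, models):
        self.models = list(models)

    def predict(self, state, action):
        # Pick a random member per query so model error is not exploited consistently.
        model = self.models[np.random.randint(len(self.models))]
        return model.predict(state, action)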
def task_fn():
    env = make('Pendulum-v0')
    name = 'mb_test'
    env_spec = env.env_spec
    model_path = '/home/yitongx/Documents/baconian-project/experiments/log'
    cyber = PendulumnCyber(env=env, epoch_to_use=60, use_traj_input=False, use_mbmf=True,
                           model_path=model_path)

    mlp_config = [
        {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "1", "N_UNITS": 32, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
        {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "2", "N_UNITS": 16, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
        {"ACT": "RELU", "B_INIT_VALUE": 0.0, "NAME": "3", "N_UNITS": 8, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03},
        {"ACT": "TANH", "B_INIT_VALUE": 0.0, "NAME": "OUTPUT", "N_UNITS": 1, "TYPE": "DENSE", "W_NORMAL_STDDEV": 0.03}
    ]
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name=name + '_mlp_q',
                              name_scope=name + '_mlp_q',
                              output_high=env.action_space.high,
                              mlp_config=mlp_config)
    mlp_policy = DeterministicMLPPolicy(env_spec=env_spec,
                                        name=name + '_mlp_policy',
                                        name_scope=name + '_mlp_policy',
                                        output_high=env.observation_space.high,
                                        mlp_config=mlp_config,
                                        reuse=False)

    polyak = 0.995
    gamma = 0.99
    noise_scale = 0.5
    noise_decay = 0.999          # default 0.995
    batch_size = 128
    actor_lr = 0.001             # default 0.001
    critic_lr = 0.001            # default 0.001
    buffer_size = 100000
    total_steps = 500000         # default 1000000
    max_step_per_episode = 500   # reset env when counter > max_step_per_episode
    train_after_step = 10000     # default 10000
    train_every_step = 1
    train_iter_per_call = 1
    test_after_step = 10000
    test_every_step = 1000
    num_test = 10

    algo = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": buffer_size,
                    "GAMMA": gamma,
                    "CRITIC_LEARNING_RATE": critic_lr,
                    "ACTOR_LEARNING_RATE": actor_lr,
                    "DECAY": polyak,
                    "BATCH_SIZE": batch_size,
                    "TRAIN_ITERATION": train_iter_per_call,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=mlp_policy,
                name=name + '_ddpg',
                replay_buffer=None)
    step_counter = SinglentonStepCounter(-1)
    noise_adder = AgentActionNoiseWrapper(
        noise=UniformNoise(scale=noise_scale),
        action_weight_scheduler=ConstantScheduler(1.),
        noise_weight_scheduler=DDPGNoiseScheduler(train_every_step=train_every_step,
                                                  train_after_step=train_after_step,
                                                  noise_decay=noise_decay,
                                                  step_counter=step_counter))
    agent = DDPG_Agent(env=env,
                       algo=algo,
                       env_spec=env_spec,
                       noise_adder=noise_adder,
                       name=name + '_agent')
    flow = create_train_test_flow(env=env,
                                  cyber=cyber,
                                  agent=agent,
                                  num_test=num_test,
                                  total_steps=total_steps,
                                  max_step_per_episode=max_step_per_episode,
                                  train_after_step=train_after_step,
                                  test_after_step=test_after_step,
                                  train_every_step=train_every_step,
                                  test_every_step=test_every_step,
                                  train_func_and_args=(agent.train, (), dict()),
                                  test_func_and_args=(agent.test, (), dict()),
                                  sample_func_and_args=(agent.sample, (), dict()),
                                  flow_type='DDPG_TrainTestFlow')
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
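The DDPGNoiseScheduler above is parameterized by noise_decay=0.999 and the shared step counter, which suggests the exploration-noise weight shrinks geometrically once training has started. The function below is a rough standalone sketch of such a decay schedule, purely as an illustration of the numbers involved; it is not the scheduler's actual implementation.

def noise_weight(step, train_after_step=10000, initial_scale=0.5, noise_decay=0.999):
    # Illustrative decay schedule: keep the full noise scale until training
    # starts, then shrink it geometrically with the number of elapsed steps.
    if step < train_after_step:
        return initial_scale
    return initial_scale * (noise_decay ** (step - train_after_step))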
""" This gives a simple example on how to use Gaussian Process (GP) to approximate the Gym environment Pendulum-v0 We use gpflow package to build the Gaussian Process. """ from baconian.core.core import EnvSpec from baconian.envs.gym_env import make import numpy as np from baconian.common.sampler.sample_data import TransitionData from baconian.algo.rl.policy.random_policy import UniformRandomPolicy from baconian.algo.dynamics.gaussian_process_dynamiocs_model import GaussianProcessDyanmicsModel from baconian.algo.dynamics.dynamics_model import DynamicsEnvWrapper from baconian.algo.dynamics.terminal_func.terminal_func import RandomTerminalFunc from baconian.algo.dynamics.reward_func.reward_func import RandomRewardFunc env = make('Pendulum-v0') name = 'demo_exp' env_spec = EnvSpec(obs_space=env.observation_space, action_space=env.action_space) data = TransitionData(env_spec=env_spec) policy = UniformRandomPolicy(env_spec=env_spec) # Do some initial sampling here to train GP model st = env.reset() for i in range(100): ac = policy.forward(st) new_st, re, _, _ = env.step(ac) data.append(state=st, new_state=new_st, action=ac, reward=re, done=False) st = new_st gp = GaussianProcessDyanmicsModel(env_spec=env_spec, batch_data=data) gp.init() gp.train()