def create_mpc(self, env_id='Acrobot-v1', name='mpc', policy=None, mlp_dyna=None, env_spec=None, env=None):
    # Fall back to a freshly created dynamics model, env spec and env when none are given.
    if mlp_dyna is None:
        mlp_dyna, local = self.create_continue_dynamics_model(env_id, name + 'mlp_dyna')
        env_spec = local['env_spec']
        env = local['env']
    # Default to a uniform random policy for sampling candidate actions.
    policy = policy if policy else UniformRandomPolicy(env_spec=env_spec, name='unp')
    algo = ModelPredictiveControl(dynamics_model=mlp_dyna,
                                  env_spec=env_spec,
                                  config_or_config_dict=dict(SAMPLED_HORIZON=2,
                                                             SAMPLED_PATH_NUM=5,
                                                             dynamics_model_train_iter=10),
                                  name=name,
                                  policy=policy)
    # The dynamics-based environment needs terminal and reward functions; random
    # placeholders are sufficient for this helper.
    algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=RandomTerminalFunc(name='random_p'),
        reward_func=RandomRewardFunc('re_fun'))
    return algo, locals()
def test_correctness(self):
    env_id = 'Pendulum-v0'
    env = make(env_id)
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    # Wrap a debugging dynamics model so the iLQR policy can plan against it.
    dyna = DebugDynamics(env_spec=env_spec)
    dyna = DynamicsEnvWrapper(dynamics=dyna)
    dyna.set_terminal_reward_func(terminal_func=RandomTerminalFunc(),
                                  reward_func=DebuggingCostFunc())
    policy = iLQRPolicy(env_spec=env_spec,
                        T=10,
                        delta=0.05,
                        iteration=2,
                        dynamics=dyna,
                        dynamics_model_train_iter=10,
                        cost_fn=DebuggingCostFunc())
    st = env.reset()
    dyna.st = np.zeros_like(st)
    for i in range(10):
        ac = policy.forward(st)
        # Step the real environment with the planned action.
        st, _, _, _ = env.step(ac)
        # st = dyna.step(action=ac, state=st)
        print("analytical optimal action -0.5, cost -0.25")
        print('state: {}, action: {}, cost {}'.format(
            st, ac, policy.iLqr_instance.cost_fn(state=st, action=ac, new_state=None)))
def create_dyna(self, env_spec=None, model_free_algo=None, dyanmics_model=None, name='dyna'):
    if not env_spec:
        model_free_algo, local = self.create_ddpg()
        dyanmics_model, _ = self.create_continuous_mlp_global_dynamics_model(env_spec=local['env_spec'])
        env_spec = local['env_spec']
        env = local['env']
    algo = Dyna(env_spec=env_spec,
                name=name,
                model_free_algo=model_free_algo,
                dynamics_model=dyanmics_model,
                config_or_config_dict=dict(dynamics_model_train_iter=1,
                                           model_free_algo_train_iter=1))
    algo.set_terminal_reward_function_for_dynamics_env(terminal_func=RandomTerminalFunc(),
                                                       reward_func=RandomRewardFunc())
    return algo, locals()
# Do some initial sampling here to train the GP model
st = env.reset()
for i in range(100):
    ac = policy.forward(st)
    new_st, re, _, _ = env.step(ac)
    data.append(state=st, new_state=new_st, action=ac, reward=re, done=False)
    st = new_st

gp = GaussianProcessDyanmicsModel(env_spec=env_spec, batch_data=data)
gp.init()
gp.train()

dyna_env = DynamicsEnvWrapper(dynamics=gp)
# Since we only care about the prediction here, set the terminal and reward
# functions to random placeholders.
dyna_env.set_terminal_reward_func(terminal_func=RandomTerminalFunc(),
                                  reward_func=RandomRewardFunc())

# Compare one-step predictions of the GP dynamics against the real environment.
st = env.reset()
real_state_list = []
dynamics_state_list = []
test_sample_count = 100
for i in range(test_sample_count):
    ac = env_spec.action_space.sample()
    gp.reset_state(state=st)
    new_state_dynamics, _, _, _ = dyna_env.step(action=ac, allow_clip=True)
    new_state_real, _, done, _ = env.step(action=ac)
    real_state_list.append(new_state_real)
    dynamics_state_list.append(new_state_dynamics)
    st = new_state_real
    if done is True:
        # Restart the episode when the real environment terminates.
        st = env.reset()
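# A minimal follow-up sketch (not part of the original snippet): quantify how well
# the GP dynamics tracks the real environment by the mean squared one-step
# prediction error over the test rollout collected above. Only numpy is assumed.
import numpy as np

prediction_mse = np.mean(
    (np.array(real_state_list) - np.array(dynamics_state_list)) ** 2)
print('mean squared one-step prediction error: {}'.format(prediction_mse))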
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=name + '_mlp_dyna',
        name=name + '_mlp_dyna',
        output_low=env_spec.obs_space.low,
        output_high=env_spec.obs_space.high,
        learning_rate=0.01,
        mlp_config=[
            {
                "ACT": "RELU",
                "B_INIT_VALUE": 0.0,
                "NAME": "1",
                "L1_NORM": 0.0,
                "L2_NORM": 0.0,
                "N_UNITS": 16,
                "TYPE": "DENSE",
                "W_NORMAL_STDDEV": 0.03
            },
            {
                "ACT": "LINEAR",
                "B_INIT_VALUE": 0.0,
                "NAME": "OUTPUT",
                "L1_NORM": 0.0,
                "L2_NORM": 0.0,
                "N_UNITS": env_spec.flat_obs_dim,
                "TYPE": "DENSE",
                "W_NORMAL_STDDEV": 0.03
            }
        ])
    algo = ModelPredictiveControl(
        dynamics_model=mlp_dyna,
        env_spec=env_spec,
        config_or_config_dict=dict(SAMPLED_HORIZON=2,
                                   SAMPLED_PATH_NUM=5,
                                   dynamics_model_train_iter=10),
        name=name + '_mpc',
        policy=UniformRandomPolicy(env_spec=env_spec, name='uni_policy'))
    algo.set_terminal_reward_function_for_dynamics_env(
        reward_func=RandomRewardFunc(name='reward_func'),
        terminal_func=RandomTerminalFunc(name='random_terminal'),
    )
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=algo,
                  name=name + '_agent',
                  exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                                     init_random_prob=0.5))
    flow = TrainTestFlow(
        train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict={
            "TEST_EVERY_SAMPLE_COUNT": 10,
            "TRAIN_EVERY_SAMPLE_COUNT": 10,
            "START_TRAIN_AFTER_SAMPLE_COUNT": 5,
            "START_TEST_AFTER_SAMPLE_COUNT": 5,
        },
        func_dict={
            'test': {
                'func': agent.test,
                'args': list(),
                'kwargs': dict(sample_count=10),
            },
            'train': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(),
            },
            'sample': {
                'func': agent.sample,
                'args': list(),
                'kwargs': dict(sample_count=100,
                               env=agent.env,
                               in_which_status='TRAIN',
                               store_flag=True),
            },
        })
    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = env.env_spec
    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=name + '_mlp_dyna',
        name=name + '_mlp_dyna',
        learning_rate=0.01,
        mlp_config=[
            {
                "ACT": "TANH",
                "B_INIT_VALUE": 0.0,
                "NAME": "1",
                "L1_NORM": 0.0,
                "L2_NORM": 0.0,
                "N_UNITS": 128,
                "TYPE": "DENSE",
                "W_NORMAL_STDDEV": 0.03
            },
            {
                "ACT": "LINEAR",
                "B_INIT_VALUE": 0.0,
                "NAME": "OUTPUT",
                "L1_NORM": 0.0,
                "L2_NORM": 0.0,
                "N_UNITS": env_spec.flat_obs_dim,
                "TYPE": "DENSE",
                "W_NORMAL_STDDEV": 0.03
            }
        ])
    algo = ModelPredictiveControl(
        dynamics_model=mlp_dyna,
        env_spec=env_spec,
        config_or_config_dict=dict(SAMPLED_HORIZON=2,
                                   SAMPLED_PATH_NUM=5,
                                   dynamics_model_train_iter=10),
        name=name + '_mpc',
        policy=UniformRandomPolicy(env_spec=env_spec, name='uni_policy'))
    algo.set_terminal_reward_function_for_dynamics_env(
        reward_func=RandomRewardFunc(name='reward_func'),
        terminal_func=RandomTerminalFunc(name='random_terminal'),
    )
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=algo,
                  name=name + '_agent',
                  exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                                     init_random_prob=0.5))
    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (), dict(sample_count=100,
                                                     env=agent.env,
                                                     store_flag=True)))
    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
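# A hedged usage sketch (not part of the snippet above): Baconian example scripts
# typically hand task_fn to an experiment runner. The import path below matches the
# entry point used in Baconian's own demos; treat it as an assumption if your version
# lays the module out differently. An experiment end point (e.g. a total train-sample
# budget configured through GlobalConfig) is normally set as well before running.
from baconian.core.experiment_runner import single_exp_runner

single_exp_runner(task_fn, del_if_log_path_existed=True)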