def pendulum_task_fn():
    exp_config = PENDULUM_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    # Global MLP dynamics model, wrapped so it can be stepped like an environment.
    mlp_dyna = ContinuousMLPGlobalDynamicsModel(env_spec=env_spec,
                                                name_scope=name + '_mlp_dyna',
                                                name=name + '_mlp_dyna',
                                                **exp_config['DynamicsModel'])
    dyna_env = DynamicsEnvWrapper(mlp_dyna)
    dyna_env.set_terminal_reward_func(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=dyna_env.total_step_count_fn),
        reward_func=REWARD_FUNC_DICT['Pendulum-v0']())

    # iLQR plans against the learned dynamics; its cost function is derived
    # from the task reward via RewardFuncCostWrapper.
    policy = iLQRPolicy(env_spec=env_spec,
                        **exp_config['ILQR'],
                        dynamics=dyna_env,
                        cost_fn=RewardFuncCostWrapper(
                            reward_func=REWARD_FUNC_DICT['Pendulum-v0']()))
    algo = iLQRAlogWrapper(policy=policy,
                           env_spec=env_spec,
                           dynamics_env=dyna_env)
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=algo,
                  exploration_strategy=None,
                  noise_adder=None,
                  name=name + '_agent')

    # Dyna-style control flow: sample from the real env, fit the dynamics
    # model, and evaluate; steps mapped to None are skipped.
    flow = DynaFlow(
        train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['DynaFlow'],
        func_dict={
            'train_dynamics': {'func': agent.train,
                               'args': list(),
                               'kwargs': dict(state='state_dynamics_training')},
            'train_algo': None,
            'test_algo': {'func': agent.test,
                          'args': list(),
                          'kwargs': dict(sample_count=1)},
            'test_dynamics': {'func': agent.algo.test_dynamics,
                              'args': list(),
                              'kwargs': dict(sample_count=100, env=env)},
            'sample_from_real_env': {'func': agent.sample,
                                     'args': list(),
                                     'kwargs': dict(sample_count=10,
                                                    env=agent.env,
                                                    in_which_status='TRAIN',
                                                    store_flag=True)},
            'sample_from_dynamics_env': None,
            'train_algo_from_synthesized_data': None
        })

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
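# A minimal launch sketch (an assumption, not shown in the original file):
# task functions like pendulum_task_fn above are typically handed to an
# experiment runner; the single_exp_runner import path and the log-path
# setting below follow the surrounding project's conventions.
from baconian.core.experiment_runner import single_exp_runner

if __name__ == '__main__':
    GlobalConfig().set('DEFAULT_LOG_PATH', './log/pendulum_ilqr_benchmark')
    single_exp_runner(pendulum_task_fn, del_if_log_path_existed=True)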
env_spec = EnvSpec(obs_space=env.observation_space,
                   action_space=env.action_space)
data = TransitionData(env_spec=env_spec)
policy = UniformRandomPolicy(env_spec=env_spec)
# Collect some initial transitions with a random policy to train the GP model.
st = env.reset()
for i in range(100):
    ac = policy.forward(st)
    new_st, re, _, _ = env.step(ac)
    data.append(state=st, new_state=new_st, action=ac, reward=re, done=False)
    st = new_st

gp = GaussianProcessDyanmicsModel(env_spec=env_spec, batch_data=data)
gp.init()
gp.train()

dyna_env = DynamicsEnvWrapper(dynamics=gp)
# Only one-step prediction accuracy matters here, so placeholder random
# terminal and reward functions are sufficient.
dyna_env.set_terminal_reward_func(terminal_func=RandomTerminalFunc(),
                                  reward_func=RandomRewardFunc())

# Step the real env and the learned dynamics side by side from the same
# states and record both next-state outcomes.
st = env.reset()
real_state_list = []
dynamics_state_list = []
test_sample_count = 100
for i in range(test_sample_count):
    ac = env_spec.action_space.sample()
    gp.reset_state(state=st)
    new_state_dynamics, _, _, _ = dyna_env.step(action=ac, allow_clip=True)
    new_state_real, _, done, _ = env.step(action=ac)
    real_state_list.append(new_state_real)
    dynamics_state_list.append(new_state_dynamics)
    st = new_state_real
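# A minimal evaluation sketch (not part of the original script; added for
# illustration): compare the GP's one-step predictions with the real
# transitions gathered above. Assumes both lists hold same-shaped state arrays.
import numpy as np

real_states = np.asarray(real_state_list)
pred_states = np.asarray(dynamics_state_list)
# Mean squared one-step prediction error, reported per state dimension.
per_dim_mse = np.mean((real_states - pred_states) ** 2, axis=0)
print('per-dimension one-step prediction MSE:', per_dim_mse)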
def return_as_env(self) -> Env:
    """Wrap this dynamics model as a standalone environment."""
    return DynamicsEnvWrapper(dynamics=self,
                              name=self._name + '_env')
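# A minimal usage sketch (hypothetical): with return_as_env, the manual
# DynamicsEnvWrapper construction in the GP example above collapses into a
# single call on the trained model.
dyna_env = gp.return_as_env()
dyna_env.set_terminal_reward_func(terminal_func=RandomTerminalFunc(),
                                  reward_func=RandomRewardFunc())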