def create_dyna_flow(self, agent, env):
    """Create a ``DynaFlow`` driven by ``agent`` with small fixed settings.

    Every ``*_EVERY_*`` setting is 10 and every ``START_*`` / ``WARM_UP_*``
    setting is 1. Each callback is a bound method of ``agent`` (or of
    ``agent.algo`` for ``test_dynamics``) called with no positional
    arguments and the keyword arguments listed below.

    :param agent: object providing ``train``, ``test``, ``sample``, ``env``
        and ``algo`` (the latter with ``test_dynamics`` and ``dynamics_env``).
    :param env: environment handed to ``agent.algo.test_dynamics``.
    :return: tuple ``(flow, namespace)`` where ``namespace`` is ``locals()``
        at return time, i.e. ``self``, ``agent``, ``env`` and ``flow``.
    """
    # NOTE: no helper variables are introduced on purpose -- ``locals()`` is
    # handed back to the caller, so any extra local name would show up in it.
    flow = DynaFlow(
        train_sample_count_func=(
            lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT')
        ),
        config_or_config_dict=dict(
            TEST_ALGO_EVERY_REAL_SAMPLE_COUNT=10,
            TEST_DYNAMICS_EVERY_REAL_SAMPLE_COUNT=10,
            TRAIN_ALGO_EVERY_REAL_SAMPLE_COUNT_FROM_REAL_ENV=10,
            TRAIN_ALGO_EVERY_REAL_SAMPLE_COUNT_FROM_DYNAMICS_ENV=10,
            TRAIN_DYNAMICS_EVERY_REAL_SAMPLE_COUNT=10,
            START_TRAIN_ALGO_AFTER_SAMPLE_COUNT=1,
            START_TRAIN_DYNAMICS_AFTER_SAMPLE_COUNT=1,
            START_TEST_ALGO_AFTER_SAMPLE_COUNT=1,
            START_TEST_DYNAMICS_AFTER_SAMPLE_COUNT=1,
            WARM_UP_DYNAMICS_SAMPLES=1,
        ),
        func_dict=dict(
            train_algo=dict(
                func=agent.train,
                args=[],
                kwargs={'state': 'state_agent_training'},
            ),
            train_algo_from_synthesized_data=dict(
                func=agent.train,
                args=[],
                kwargs={'state': 'state_agent_training'},
            ),
            train_dynamics=dict(
                func=agent.train,
                args=[],
                kwargs={'state': 'state_dynamics_training'},
            ),
            test_algo=dict(
                func=agent.test,
                args=[],
                kwargs={'sample_count': 10},
            ),
            test_dynamics=dict(
                func=agent.algo.test_dynamics,
                args=[],
                kwargs={'sample_count': 10, 'env': env},
            ),
            sample_from_real_env=dict(
                func=agent.sample,
                args=[],
                kwargs={
                    'sample_count': 10,
                    'env': agent.env,
                    'in_which_status': 'TRAIN',
                    'store_flag': True,
                },
            ),
            sample_from_dynamics_env=dict(
                func=agent.sample,
                args=[],
                kwargs={
                    'sample_count': 10,
                    'env': agent.algo.dynamics_env,
                    'in_which_status': 'TRAIN',
                    'store_flag': True,
                },
            ),
        ),
    )
    return flow, locals()
def pendulum_task_fn():
    """Build and run the 'benchmark' Dyna experiment on ``Pendulum-v0``.

    A DDPG learner (MLP Q-function plus deterministic MLP policy) and a
    continuous MLP dynamics model are combined into ``Dyna``; the result is
    wrapped in an ``Agent`` whose actions pass through an
    ``AgentActionNoiseWrapper`` (``NormalActionNoise``, constant noise weight
    0.3, constant action weight 1.0) and is run under a ``DynaFlow``.

    Hyper-parameters are read from ``exp_config``.
    NOTE(review): ``exp_config`` is not assigned inside this function; it is
    presumably a module-level name -- confirm it is in scope.
    """

    def _call_spec(func, **kwargs):
        # One ``func_dict`` entry: the callable, no positional args, its kwargs.
        return {'func': func, 'args': [], 'kwargs': kwargs}

    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    # Model-free part: critic, actor, then DDPG (construction order kept).
    q_scope = name + '_mlp_q'
    q_function = MLPQValueFunction(env_spec=env_spec,
                                   name_scope=q_scope,
                                   name=q_scope,
                                   **exp_config['MLPQValueFunction'])

    policy_scope = name + '_mlp_policy'
    actor = DeterministicMLPPolicy(env_spec=env_spec,
                                   name_scope=policy_scope,
                                   name=policy_scope,
                                   output_low=env_spec.action_space.low,
                                   output_high=env_spec.action_space.high,
                                   **exp_config['DeterministicMLPPolicy'],
                                   reuse=False)

    model_free = DDPG(env_spec=env_spec,
                      policy=actor,
                      value_func=q_function,
                      name=name + '_ddpg',
                      **exp_config['DDPG'])

    # Learned dynamics model, with outputs bounded by the observation space.
    dyna_scope = name + '_mlp_dyna'
    dynamics_model = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=dyna_scope,
        name=dyna_scope,
        output_low=env_spec.obs_space.low,
        output_high=env_spec.obs_space.high,
        **exp_config['DynamicsModel'])

    dyna_algo = Dyna(env_spec=env_spec,
                     name=name + '_dyna_algo',
                     model_free_algo=model_free,
                     dynamics_model=dynamics_model,
                     config_or_config_dict={
                         'dynamics_model_train_iter': 10,
                         'model_free_algo_train_iter': 10,
                     })

    # Terminal function is built before the reward function, as in the
    # original argument order.
    episode_limit = FixedEpisodeLengthTerminalFunc(
        max_step_length=env.unwrapped._max_episode_steps,
        step_count_fn=dyna_algo.dynamics_env.total_step_count_fn)
    pendulum_reward = REWARD_FUNC_DICT['Pendulum-v0']()
    dyna_algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=episode_limit,
        reward_func=pendulum_reward)

    action_noise = AgentActionNoiseWrapper(
        noise=NormalActionNoise(),
        noise_weight_scheduler=ConstantSchedule(value=0.3),
        action_weight_scheduler=ConstantSchedule(value=1.0))
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=dyna_algo,
                  exploration_strategy=None,
                  noise_adder=action_noise,
                  name=name + '_agent')

    flow_callbacks = {
        'train_algo': _call_spec(agent.train,
                                 state='state_agent_training'),
        'train_algo_from_synthesized_data': _call_spec(
            agent.train,
            state='state_agent_training',
            train_iter=1),
        'train_dynamics': _call_spec(agent.train,
                                     state='state_dynamics_training'),
        'test_algo': _call_spec(agent.test,
                                sample_count=1,
                                sample_trajectory_flag=True),
        'test_dynamics': _call_spec(agent.algo.test_dynamics,
                                    sample_count=10,
                                    env=env),
        'sample_from_real_env': _call_spec(agent.sample,
                                           sample_count=10,
                                           env=agent.env,
                                           in_which_status='TRAIN',
                                           store_flag=True),
        # Samples drawn from the learned dynamics env are not stored
        # (store_flag=False), unlike the real-env samples above.
        'sample_from_dynamics_env': _call_spec(agent.sample,
                                               sample_count=50,
                                               sample_type='transition',
                                               env=agent.algo.dynamics_env,
                                               in_which_status='TRAIN',
                                               store_flag=False),
    }
    flow = DynaFlow(
        train_sample_count_func=(
            lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT')
        ),
        config_or_config_dict=exp_config['DynaFlow'],
        func_dict=flow_callbacks)

    Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name).run()
def task_fn():
    """Build and run the 'demo_exp' Dyna experiment on ``Pendulum-v0``.

    All hyper-parameters are written inline: two-layer MLPs (16 units, then
    an output layer) for the Q-function, the policy and the dynamics model,
    a DDPG learner, and a ``DynaFlow`` whose ``*_EVERY_*`` settings are 10
    and whose ``START_*`` / ``WARM_UP_*`` settings are 1.
    """

    def _call_spec(func, **kwargs):
        # One ``func_dict`` entry: the callable, no positional args, its kwargs.
        return {'func': func, 'args': [], 'kwargs': kwargs}

    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    # The "OUPTUT" spelling below is a runtime layer name and is kept as-is.
    q_layers = [
        {
            "ACT": "RELU",
            "B_INIT_VALUE": 0.0,
            "NAME": "1",
            "N_UNITS": 16,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03,
        },
        {
            "ACT": "LINEAR",
            "B_INIT_VALUE": 0.0,
            "NAME": "OUPTUT",
            "N_UNITS": 1,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03,
        },
    ]
    q_scope = name + '_mlp_q'
    q_function = MLPQValueFunction(env_spec=env_spec,
                                   name_scope=q_scope,
                                   name=q_scope,
                                   mlp_config=q_layers)

    policy_layers = [
        {
            "ACT": "RELU",
            "B_INIT_VALUE": 0.0,
            "NAME": "1",
            "N_UNITS": 16,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03,
        },
        {
            "ACT": "LINEAR",
            "B_INIT_VALUE": 0.0,
            "NAME": "OUPTUT",
            "N_UNITS": env_spec.flat_action_dim,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03,
        },
    ]
    policy_scope = name + '_mlp_policy'
    actor = DeterministicMLPPolicy(env_spec=env_spec,
                                   name_scope=policy_scope,
                                   name=policy_scope,
                                   mlp_config=policy_layers,
                                   reuse=False)

    ddpg_settings = {
        "REPLAY_BUFFER_SIZE": 10000,
        "GAMMA": 0.999,
        "Q_NET_L1_NORM_SCALE": 0.01,
        "Q_NET_L2_NORM_SCALE": 0.01,
        "CRITIC_LEARNING_RATE": 0.001,
        "ACTOR_LEARNING_RATE": 0.001,
        "DECAY": 0.5,
        "BATCH_SIZE": 50,
        "TRAIN_ITERATION": 1,
        "critic_clip_norm": 0.1,
        "actor_clip_norm": 0.1,
    }
    model_free = DDPG(env_spec=env_spec,
                      config_or_config_dict=ddpg_settings,
                      value_func=q_function,
                      policy=actor,
                      name=name + '_ddpg',
                      replay_buffer=None)

    dynamics_layers = [
        {
            "ACT": "RELU",
            "B_INIT_VALUE": 0.0,
            "NAME": "1",
            "L1_NORM": 0.0,
            "L2_NORM": 0.0,
            "N_UNITS": 16,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03,
        },
        {
            "ACT": "LINEAR",
            "B_INIT_VALUE": 0.0,
            "NAME": "OUPTUT",
            "L1_NORM": 0.0,
            "L2_NORM": 0.0,
            "N_UNITS": env_spec.flat_obs_dim,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03,
        },
    ]
    dyna_scope = name + '_mlp_dyna'
    dynamics_model = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=dyna_scope,
        name=dyna_scope,
        output_low=env_spec.obs_space.low,
        output_high=env_spec.obs_space.high,
        learning_rate=0.01,
        mlp_config=dynamics_layers)

    dyna_algo = Dyna(env_spec=env_spec,
                     name=name + '_dyna_algo',
                     model_free_algo=model_free,
                     dynamics_model=dynamics_model,
                     config_or_config_dict={
                         'dynamics_model_train_iter': 10,
                         'model_free_algo_train_iter': 10,
                     })

    # Example only: the dynamics env gets a random reward function and a
    # terminal function that ends episodes after a fixed number of steps.
    episode_limit = FixedEpisodeLengthTerminalFunc(
        max_step_length=env.unwrapped._max_episode_steps,
        step_count_fn=dyna_algo.dynamics_env.total_step_count_fn)
    random_reward = RandomRewardFunc()
    dyna_algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=episode_limit,
        reward_func=random_reward)

    # The saving scheduler is created before the exploration strategy, as in
    # the original keyword order.
    saving_schedule = PeriodicalEventSchedule(
        t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        trigger_every_step=20,
        after_t=10)
    exploration = EpsilonGreedy(action_space=env_spec.action_space,
                                init_random_prob=0.5)
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=dyna_algo,
                  algo_saving_scheduler=saving_schedule,
                  name=name + '_agent',
                  exploration_strategy=exploration)

    flow_settings = {
        "TRAIN_ALGO_EVERY_REAL_SAMPLE_COUNT_FROM_REAL_ENV": 10,
        "TRAIN_ALGO_EVERY_REAL_SAMPLE_COUNT_FROM_DYNAMICS_ENV": 10,
        "TEST_ALGO_EVERY_REAL_SAMPLE_COUNT": 10,
        "TEST_DYNAMICS_EVERY_REAL_SAMPLE_COUNT": 10,
        "TRAIN_DYNAMICS_EVERY_REAL_SAMPLE_COUNT": 10,
        "START_TRAIN_ALGO_AFTER_SAMPLE_COUNT": 1,
        "START_TRAIN_DYNAMICS_AFTER_SAMPLE_COUNT": 1,
        "START_TEST_ALGO_AFTER_SAMPLE_COUNT": 1,
        "START_TEST_DYNAMICS_AFTER_SAMPLE_COUNT": 1,
        "WARM_UP_DYNAMICS_SAMPLES": 1,
    }
    flow_callbacks = {
        'train_algo': _call_spec(agent.train,
                                 state='state_agent_training'),
        'train_algo_from_synthesized_data': _call_spec(
            agent.train,
            state='state_agent_training'),
        'train_dynamics': _call_spec(agent.train,
                                     state='state_dynamics_training'),
        'test_algo': _call_spec(agent.test, sample_count=10),
        'test_dynamics': _call_spec(agent.algo.test_dynamics,
                                    sample_count=10,
                                    env=env),
        'sample_from_real_env': _call_spec(agent.sample,
                                           sample_count=10,
                                           env=agent.env,
                                           in_which_status='TRAIN',
                                           store_flag=True),
        'sample_from_dynamics_env': _call_spec(agent.sample,
                                               sample_count=10,
                                               env=agent.algo.dynamics_env,
                                               in_which_status='TRAIN',
                                               store_flag=True),
    }
    flow = DynaFlow(
        train_sample_count_func=(
            lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT')
        ),
        config_or_config_dict=flow_settings,
        func_dict=flow_callbacks)

    Experiment(tuner=None,
               env=env,
               agent=agent,
               flow=flow,
               name=name + '_exp').run()
def pendulum_task_fn():
    """Build and run the 'benchmark' MPC experiment on ``Pendulum-v0``.

    A continuous MLP dynamics model is planned over by
    ``ModelPredictiveControl`` with a ``UniformRandomPolicy``. Hyper-parameters
    come from ``PENDULUM_BENCHMARK_CONFIG_DICT``. Only the dynamics model is
    trained by the flow: the ``train_algo``, ``sample_from_dynamics_env`` and
    ``train_algo_from_synthesized_data`` entries are ``None``.
    """

    def _call_spec(func, **kwargs):
        # One ``func_dict`` entry: the callable, no positional args, its kwargs.
        return {'func': func, 'args': [], 'kwargs': kwargs}

    exp_config = PENDULUM_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    dyna_scope = name + '_mlp_dyna'
    dynamics_model = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=dyna_scope,
        name=dyna_scope,
        **exp_config['DynamicsModel'])

    mpc = ModelPredictiveControl(
        dynamics_model=dynamics_model,
        env_spec=env_spec,
        config_or_config_dict=exp_config['MPC'],
        name=name + '_mpc',
        policy=UniformRandomPolicy(env_spec=env_spec, name='uni_policy'))

    # Reward function is built before the terminal function, as in the
    # original argument order.
    pendulum_reward = REWARD_FUNC_DICT['Pendulum-v0']()
    episode_limit = FixedEpisodeLengthTerminalFunc(
        max_step_length=env.unwrapped._max_episode_steps,
        step_count_fn=mpc.dynamics_env.total_step_count_fn)
    mpc.set_terminal_reward_function_for_dynamics_env(
        reward_func=pendulum_reward,
        terminal_func=episode_limit)

    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=mpc,
                  exploration_strategy=None,
                  noise_adder=None,
                  name=name + '_agent')

    flow_callbacks = {
        # agent.train is called with no keyword arguments in this variant.
        'train_dynamics': _call_spec(agent.train),
        'train_algo': None,
        'test_algo': _call_spec(agent.test,
                                sample_count=1,
                                sample_trajectory_flag=True),
        'test_dynamics': _call_spec(agent.algo.test_dynamics,
                                    sample_count=100,
                                    env=env),
        'sample_from_real_env': _call_spec(agent.sample,
                                           sample_count=10,
                                           env=agent.env,
                                           in_which_status='TRAIN',
                                           store_flag=True),
        'sample_from_dynamics_env': None,
        'train_algo_from_synthesized_data': None,
    }
    flow = DynaFlow(
        train_sample_count_func=(
            lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT')
        ),
        config_or_config_dict=exp_config['DynaFlow'],
        func_dict=flow_callbacks)

    Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name).run()
def pendulum_task_fn():
    """Build and run the 'benchmark' iLQR experiment on ``Pendulum-v0``.

    A continuous MLP dynamics model is wrapped in a ``DynamicsEnvWrapper`` and
    used by an ``iLQRPolicy`` whose cost comes from the Pendulum reward
    function via ``RewardFuncCostWrapper``. Hyper-parameters come from
    ``PENDULUM_BENCHMARK_CONFIG_DICT``. Only the dynamics model is trained by
    the flow: the ``train_algo``, ``sample_from_dynamics_env`` and
    ``train_algo_from_synthesized_data`` entries are ``None``.
    """

    def _call_spec(func, **kwargs):
        # One ``func_dict`` entry: the callable, no positional args, its kwargs.
        return {'func': func, 'args': [], 'kwargs': kwargs}

    exp_config = PENDULUM_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    dyna_scope = name + '_mlp_dyna'
    dynamics_model = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=dyna_scope,
        name=dyna_scope,
        output_low=env_spec.obs_space.low,
        output_high=env_spec.obs_space.high,
        **exp_config['DynamicsModel'])

    dynamics_env = DynamicsEnvWrapper(dynamics_model)
    episode_limit = FixedEpisodeLengthTerminalFunc(
        max_step_length=env.unwrapped._max_episode_steps,
        step_count_fn=dynamics_env.total_step_count_fn)
    env_reward = REWARD_FUNC_DICT['Pendulum-v0']()
    dynamics_env.set_terminal_reward_func(terminal_func=episode_limit,
                                          reward_func=env_reward)

    # The config lookup precedes the cost wrapper, matching the original
    # argument evaluation order. The cost wrapper gets its own reward
    # function instance, separate from the one given to the dynamics env.
    ilqr_settings = exp_config['ILQR']
    cost = RewardFuncCostWrapper(
        reward_func=REWARD_FUNC_DICT['Pendulum-v0']())
    planner = iLQRPolicy(env_spec=env_spec,
                         **ilqr_settings,
                         dynamics=dynamics_env,
                         cost_fn=cost)

    ilqr_algo = iLQRAlogWrapper(policy=planner,
                                env_spec=env_spec,
                                dynamics_env=dynamics_env)

    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=ilqr_algo,
                  exploration_strategy=None,
                  noise_adder=None,
                  name=name + '_agent')

    flow_callbacks = {
        'train_dynamics': _call_spec(agent.train,
                                     state='state_dynamics_training'),
        'train_algo': None,
        'test_algo': _call_spec(agent.test,
                                sample_count=1,
                                sample_trajectory_flag=True),
        'test_dynamics': _call_spec(agent.algo.test_dynamics,
                                    sample_count=100,
                                    env=env),
        'sample_from_real_env': _call_spec(agent.sample,
                                           sample_count=10,
                                           env=agent.env,
                                           in_which_status='TRAIN',
                                           store_flag=True),
        'sample_from_dynamics_env': None,
        'train_algo_from_synthesized_data': None,
    }
    flow = DynaFlow(
        train_sample_count_func=(
            lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT')
        ),
        config_or_config_dict=exp_config['DynaFlow'],
        func_dict=flow_callbacks)

    Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name).run()