# NOTE: these examples assume the usual Baconian imports are in scope (make,
# EnvSpec, MLPQValueFunction, DeterministicMLPPolicy, DDPG, DQN, PPO, Agent,
# EpsilonGreedy, the schedulers, create_train_test_flow, Experiment,
# get_global_status_collect, ...); the exact module paths depend on the
# Baconian version in use.

# DDPG on Pendulum-v0: an MLP Q-function plus a deterministic MLP policy,
# trained and tested by a sample-count-driven train/test flow.
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[
                                  {"ACT": "RELU",
                                   "B_INIT_VALUE": 0.0,
                                   "NAME": "1",
                                   "N_UNITS": 16,
                                   "TYPE": "DENSE",
                                   "W_NORMAL_STDDEV": 0.03},
                                  {"ACT": "LINEAR",
                                   "B_INIT_VALUE": 0.0,
                                   "NAME": "OUTPUT",
                                   "N_UNITS": 1,
                                   "TYPE": "DENSE",
                                   "W_NORMAL_STDDEV": 0.03}
                              ])
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    mlp_config=[
                                        {"ACT": "RELU",
                                         "B_INIT_VALUE": 0.0,
                                         "NAME": "1",
                                         "N_UNITS": 16,
                                         "TYPE": "DENSE",
                                         "W_NORMAL_STDDEV": 0.03},
                                        {"ACT": "LINEAR",
                                         "B_INIT_VALUE": 0.0,
                                         "NAME": "OUTPUT",
                                         "N_UNITS": env_spec.flat_action_dim,
                                         "TYPE": "DENSE",
                                         "W_NORMAL_STDDEV": 0.03}
                                    ],
                                    reuse=False)
    ddpg = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": 10000,
                    "GAMMA": 0.999,
                    "CRITIC_LEARNING_RATE": 0.001,
                    "ACTOR_LEARNING_RATE": 0.001,
                    "DECAY": 0.5,
                    "BATCH_SIZE": 50,
                    "TRAIN_ITERATION": 1,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=policy,
                name=name + '_ddpg',
                replay_buffer=None)
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=ddpg,
                  # Save the algorithm every 20 training samples, starting at 10.
                  algo_saving_scheduler=PeriodicalEventSchedule(
                      t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                      trigger_every_step=20,
                      after_t=10),
                  name=name + '_agent',
                  exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                                     init_random_prob=0.5))
    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (), dict(sample_count=100,
                                                     env=agent.env,
                                                     in_which_status='TRAIN',
                                                     store_flag=True)))
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
# PPO on Pendulum-v0: an MLP V-function (with L1/L2 regularization on the
# hidden layer) plus a Gaussian (normal-distribution) MLP policy.
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp_'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    mlp_v = MLPVValueFunc(env_spec=env_spec,
                          name_scope=name + 'mlp_v',
                          name=name + 'mlp_v',
                          mlp_config=[
                              {"ACT": "RELU",
                               "B_INIT_VALUE": 0.0,
                               "NAME": "1",
                               "N_UNITS": 16,
                               "L1_NORM": 0.01,
                               "L2_NORM": 0.01,
                               "TYPE": "DENSE",
                               "W_NORMAL_STDDEV": 0.03},
                              {"ACT": "LINEAR",
                               "B_INIT_VALUE": 0.0,
                               "NAME": "OUTPUT",
                               "N_UNITS": 1,
                               "TYPE": "DENSE",
                               "W_NORMAL_STDDEV": 0.03}
                          ])
    policy = NormalDistributionMLPPolicy(env_spec=env_spec,
                                         name_scope=name + 'mlp_policy',
                                         name=name + 'mlp_policy',
                                         mlp_config=[
                                             {"ACT": "RELU",
                                              "B_INIT_VALUE": 0.0,
                                              "NAME": "1",
                                              "L1_NORM": 0.01,
                                              "L2_NORM": 0.01,
                                              "N_UNITS": 16,
                                              "TYPE": "DENSE",
                                              "W_NORMAL_STDDEV": 0.03},
                                             {"ACT": "LINEAR",
                                              "B_INIT_VALUE": 0.0,
                                              "NAME": "OUTPUT",
                                              "N_UNITS": env_spec.flat_action_dim,
                                              "TYPE": "DENSE",
                                              "W_NORMAL_STDDEV": 0.03}
                                         ],
                                         reuse=False)
    ppo = PPO(env_spec=env_spec,
              config_or_config_dict={
                  "gamma": 0.995,
                  "lam": 0.98,
                  "policy_train_iter": 10,
                  "value_func_train_iter": 10,
                  "clipping_range": None,
                  "beta": 1.0,
                  "eta": 50,
                  "log_var_init": -1.0,
                  "kl_target": 0.003,
                  "policy_lr": 0.01,
                  "value_func_lr": 0.01,
                  "value_func_train_batch_size": 10,
                  "lr_multiplier": 1.0
              },
              value_func=mlp_v,
              stochastic_policy=policy,
              name=name + 'ppo')
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=ppo,
                  algo_saving_scheduler=PeriodicalEventSchedule(
                      t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                      trigger_every_step=20,
                      after_t=10),
                  name=name + 'agent',
                  exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                                     init_random_prob=0.5))
    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (), dict(sample_count=100,
                                                     env=agent.env,
                                                     in_which_status='TRAIN',
                                                     store_flag=True)))
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
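# The "gamma"/"lam" pair in the PPO config above are the standard discount and
# GAE(lambda) coefficients. As a reference for what those two numbers control,
# here is a minimal standalone sketch of generalized advantage estimation,
# ignoring episode-termination masking for brevity; the helper name `gae` is
# hypothetical and this is not Baconian's implementation.
import numpy as np

def gae(rewards, values, gamma=0.995, lam=0.98):
    # `values` carries one extra bootstrap entry, so
    # len(values) == len(rewards) + 1.
    advantages = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        # One-step TD residual, then exponentially weighted accumulation.
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        running = delta + gamma * lam * running
        advantages[t] = running
    return advantages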
# DQN on Acrobot-v1, demonstrating parameter scheduling: a piecewise schedule
# drives the epsilon-greedy exploration probability, and a linear schedule is
# attached to the learning rate once the experiment has been built.
def task_fn():
    env = make('Acrobot-v1')
    name = 'example_scheduler_'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[
                                  {"ACT": "RELU",
                                   "B_INIT_VALUE": 0.0,
                                   "NAME": "1",
                                   "N_UNITS": 16,
                                   "TYPE": "DENSE",
                                   "W_NORMAL_STDDEV": 0.03},
                                  {"ACT": "LINEAR",
                                   "B_INIT_VALUE": 0.0,
                                   "NAME": "OUTPUT",
                                   "N_UNITS": 1,
                                   "TYPE": "DENSE",
                                   "W_NORMAL_STDDEV": 0.03}
                              ])
    dqn = DQN(env_spec=env_spec,
              config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                         GAMMA=0.99,
                                         BATCH_SIZE=10,
                                         LEARNING_RATE=0.001,
                                         TRAIN_ITERATION=1,
                                         DECAY=0.5),
              name=name + '_dqn',
              value_func=mlp_q)
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=dqn,
                  name=name + '_agent',
                  algo_saving_scheduler=PeriodicalEventSchedule(
                      t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                      trigger_every_step=20,
                      after_t=10),
                  # Decay the random-action probability in piecewise-linear
                  # segments as the training sample count grows.
                  exploration_strategy=EpsilonGreedy(
                      action_space=env_spec.action_space,
                      prob_scheduler=PiecewiseScheduler(
                          t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                          endpoints=((10, 0.3), (100, 0.1), (200, 0.0)),
                          outside_value=0.0),
                      init_random_prob=0.5))
    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (), dict(sample_count=100,
                                                     env=agent.env,
                                                     store_flag=True)))
    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name + 'experiment_debug')
    # Linearly anneal the DQN learning rate over the whole experiment.
    dqn.parameters.set_scheduler(
        param_key='LEARNING_RATE',
        scheduler=LinearScheduler(
            t_fn=experiment.TOTAL_AGENT_TRAIN_SAMPLE_COUNT,
            schedule_timesteps=GlobalConfig().DEFAULT_EXPERIMENT_END_POINT['TOTAL_AGENT_TRAIN_SAMPLE_COUNT'],
            final_p=0.0001,
            initial_p=0.01))
    experiment.run()
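# For reference, the LinearScheduler used above interpolates from initial_p to
# final_p over schedule_timesteps and then holds final_p. A minimal standalone
# sketch of that rule (the function name is hypothetical, not Baconian's API):
def linear_schedule(t, schedule_timesteps, initial_p, final_p):
    # Fraction of the schedule elapsed, clipped to [0, 1].
    fraction = min(float(t) / schedule_timesteps, 1.0)
    return initial_p + fraction * (final_p - initial_p)

# E.g. halfway through a 1000-step schedule decaying 0.01 -> 0.0001:
# linear_schedule(500, 1000, 0.01, 0.0001) == 0.00505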
# DQN on Acrobot-v1 with a deeper Q-network and a linearly annealed
# epsilon-greedy exploration schedule.
def task_fn():
    env = make('Acrobot-v1')
    name = 'demo_exp'
    env.env_spec = EnvSpec(obs_space=env.observation_space,
                           action_space=env.action_space)
    env_spec = env.env_spec
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[
                                  {"ACT": "TANH",
                                   "B_INIT_VALUE": 0.0,
                                   "NAME": "1",
                                   "N_UNITS": 64,
                                   "TYPE": "DENSE",
                                   "W_NORMAL_STDDEV": 0.03},
                                  {"ACT": "TANH",
                                   "B_INIT_VALUE": 0.0,
                                   "NAME": "2",
                                   "N_UNITS": 64,
                                   "TYPE": "DENSE",
                                   "W_NORMAL_STDDEV": 0.03},
                                  {"ACT": "RELU",
                                   "B_INIT_VALUE": 0.0,
                                   "NAME": "3",
                                   "N_UNITS": 256,
                                   "TYPE": "DENSE",
                                   "W_NORMAL_STDDEV": 0.03},
                                  {"ACT": "LINEAR",
                                   "B_INIT_VALUE": 0.0,
                                   "NAME": "OUTPUT",
                                   "N_UNITS": 1,
                                   "TYPE": "DENSE",
                                   "W_NORMAL_STDDEV": 0.03}
                              ])
    dqn = DQN(env_spec=env_spec,
              config_or_config_dict=dict(REPLAY_BUFFER_SIZE=50000,
                                         GAMMA=0.99,
                                         BATCH_SIZE=32,
                                         LEARNING_RATE=0.001,
                                         TRAIN_ITERATION=1,
                                         DECAY=0),
              name=name + '_dqn',
              value_func=mlp_q)
    # Anneal the random-action probability from 1.0 down to 0.02 over the
    # first 10% of 100000 training samples.
    epsilon_greedy = EpsilonGreedy(action_space=env_spec.action_space,
                                   prob_scheduler=LinearScheduler(
                                       t_fn=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
                                       schedule_timesteps=int(0.1 * 100000),
                                       initial_p=1.0,
                                       final_p=0.02),
                                   init_random_prob=0.1)
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=dqn,
                  name=name + '_agent',
                  exploration_strategy=epsilon_greedy,
                  noise_adder=None)
    flow = create_train_test_flow(
        test_every_sample_count=1000,
        train_every_sample_count=1,
        start_test_after_sample_count=0,
        start_train_after_sample_count=10000,
        sample_func_and_args=(agent.sample, (), dict(sample_count=1,
                                                     env=agent.env,
                                                     store_flag=True)),
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=1)))
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
# Model-based DDPG (MBMF-style) on Pendulum-v0: a pretrained PendulumnCyber
# dynamics model supplies the model-based rollouts, and uniform action noise
# is decayed by a DDPG-specific noise scheduler.
def task_fn():
    env = make('Pendulum-v0')
    name = 'mb_test'
    env_spec = env.env_spec
    # Adjust this to the local directory holding the pretrained dynamics model.
    model_path = '/home/yitongx/Documents/baconian-project/experiments/log'
    cyber = PendulumnCyber(env=env,
                           epoch_to_use=60,
                           use_traj_input=False,
                           use_mbmf=True,
                           model_path=model_path)
    mlp_config = [
        {"ACT": "RELU",
         "B_INIT_VALUE": 0.0,
         "NAME": "1",
         "N_UNITS": 32,
         "TYPE": "DENSE",
         "W_NORMAL_STDDEV": 0.03},
        {"ACT": "RELU",
         "B_INIT_VALUE": 0.0,
         "NAME": "2",
         "N_UNITS": 16,
         "TYPE": "DENSE",
         "W_NORMAL_STDDEV": 0.03},
        {"ACT": "RELU",
         "B_INIT_VALUE": 0.0,
         "NAME": "3",
         "N_UNITS": 8,
         "TYPE": "DENSE",
         "W_NORMAL_STDDEV": 0.03},
        {"ACT": "TANH",
         "B_INIT_VALUE": 0.0,
         "NAME": "OUTPUT",
         "N_UNITS": 1,
         "TYPE": "DENSE",
         "W_NORMAL_STDDEV": 0.03}
    ]
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name=name + '_mlp_q',
                              name_scope=name + '_mlp_q',
                              output_high=env.action_space.high,
                              mlp_config=mlp_config)
    # NOTE: output_high is taken from the observation space here;
    # env.action_space.high may be what is intended for a policy output.
    mlp_policy = DeterministicMLPPolicy(env_spec=env_spec,
                                        name=name + '_mlp_policy',
                                        name_scope=name + '_mlp_policy',
                                        output_high=env.observation_space.high,
                                        mlp_config=mlp_config,
                                        reuse=False)
    polyak = 0.995
    gamma = 0.99
    noise_scale = 0.5
    noise_decay = 0.999  # default 0.995
    batch_size = 128
    actor_lr = 0.001  # default 0.001
    critic_lr = 0.001  # default 0.001
    buffer_size = 100000
    total_steps = 500000  # default 1000000
    max_step_per_episode = 500  # reset env when counter > max_step_per_episode
    train_after_step = 10000  # default 10000
    train_every_step = 1
    train_iter_per_call = 1
    test_after_step = 10000
    test_every_step = 1000
    num_test = 10
    algo = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": buffer_size,
                    "GAMMA": gamma,
                    "CRITIC_LEARNING_RATE": critic_lr,
                    "ACTOR_LEARNING_RATE": actor_lr,
                    "DECAY": polyak,
                    "BATCH_SIZE": batch_size,
                    "TRAIN_ITERATION": train_iter_per_call,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=mlp_policy,
                name=name + '_ddpg',
                replay_buffer=None)
    step_counter = SinglentonStepCounter(-1)
    # Mix raw actions with uniform noise; the noise weight decays once
    # training starts.
    noise_adder = AgentActionNoiseWrapper(
        noise=UniformNoise(scale=noise_scale),
        action_weight_scheduler=ConstantScheduler(1.),
        noise_weight_scheduler=DDPGNoiseScheduler(train_every_step=train_every_step,
                                                  train_after_step=train_after_step,
                                                  noise_decay=noise_decay,
                                                  step_counter=step_counter))
    agent = DDPG_Agent(env=env,
                       algo=algo,
                       env_spec=env_spec,
                       noise_adder=noise_adder,
                       name=name + '_agent')
    flow = create_train_test_flow(env=env,
                                  cyber=cyber,
                                  agent=agent,
                                  num_test=num_test,
                                  total_steps=total_steps,
                                  max_step_per_episode=max_step_per_episode,
                                  train_after_step=train_after_step,
                                  test_after_step=test_after_step,
                                  train_every_step=train_every_step,
                                  test_every_step=test_every_step,
                                  train_func_and_args=(agent.train, (), dict()),
                                  test_func_and_args=(agent.test, (), dict()),
                                  sample_func_and_args=(agent.sample, (), dict()),
                                  flow_type='DDPG_TrainTestFlow')
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
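# In the DDPG config above, DECAY is set to the polyak coefficient, which
# suggests soft target-network updates of the form (assuming Baconian applies
# DECAY this way):
#     theta_target <- polyak * theta_target + (1 - polyak) * theta
# With polyak = 0.995 the target networks track the online networks slowly,
# which stabilizes the bootstrapped critic targets.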
# DQN on Acrobot-v1 with L1/L2 regularization applied to the Q-network.
def task_fn():
    env = make('Acrobot-v1')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)
    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[
                                  {"ACT": "RELU",
                                   "B_INIT_VALUE": 0.0,
                                   "NAME": "1",
                                   "N_UNITS": 16,
                                   "TYPE": "DENSE",
                                   "W_NORMAL_STDDEV": 0.03},
                                  {"ACT": "LINEAR",
                                   "B_INIT_VALUE": 0.0,
                                   "NAME": "OUTPUT",
                                   "N_UNITS": 1,
                                   "TYPE": "DENSE",
                                   "W_NORMAL_STDDEV": 0.03}
                              ])
    dqn = DQN(env_spec=env_spec,
              config_or_config_dict=dict(REPLAY_BUFFER_SIZE=1000,
                                         GAMMA=0.99,
                                         BATCH_SIZE=10,
                                         Q_NET_L1_NORM_SCALE=0.001,
                                         Q_NET_L2_NORM_SCALE=0.001,
                                         LEARNING_RATE=0.01,
                                         TRAIN_ITERATION=1,
                                         DECAY=0.5),
              name=name + '_dqn',
              value_func=mlp_q)
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=dqn,
                  name=name + '_agent',
                  exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                                     init_random_prob=0.5))
    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (), dict(sample_count=100,
                                                     env=agent.env,
                                                     in_which_status='TRAIN',
                                                     store_flag=True)))
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
# Model predictive control on Pendulum-v0: a global MLP dynamics model is
# learned, and actions are chosen by sampling short rollouts under a uniform
# random policy with placeholder reward/terminal functions.
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = env.env_spec
    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=name + '_mlp_dyna',
        name=name + '_mlp_dyna',
        learning_rate=0.01,
        mlp_config=[
            {"ACT": "TANH",
             "B_INIT_VALUE": 0.0,
             "NAME": "1",
             "L1_NORM": 0.0,
             "L2_NORM": 0.0,
             "N_UNITS": 128,
             "TYPE": "DENSE",
             "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR",
             "B_INIT_VALUE": 0.0,
             "NAME": "OUTPUT",
             "L1_NORM": 0.0,
             "L2_NORM": 0.0,
             "N_UNITS": env_spec.flat_obs_dim,
             "TYPE": "DENSE",
             "W_NORMAL_STDDEV": 0.03}
        ])
    algo = ModelPredictiveControl(
        dynamics_model=mlp_dyna,
        env_spec=env_spec,
        config_or_config_dict=dict(SAMPLED_HORIZON=2,
                                   SAMPLED_PATH_NUM=5,
                                   dynamics_model_train_iter=10),
        name=name + '_mpc',
        policy=UniformRandomPolicy(env_spec=env_spec, name='uni_policy'))
    # Placeholder reward/terminal functions for the dynamics-based environment.
    algo.set_terminal_reward_function_for_dynamics_env(
        reward_func=RandomRewardFunc(name='reward_func'),
        terminal_func=RandomTerminalFunc(name='random_terminal'))
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=algo,
                  name=name + '_agent',
                  exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                                     init_random_prob=0.5))
    flow = create_train_test_flow(
        test_every_sample_count=10,
        train_every_sample_count=10,
        start_test_after_sample_count=5,
        start_train_after_sample_count=5,
        train_func_and_args=(agent.train, (), dict()),
        test_func_and_args=(agent.test, (), dict(sample_count=10)),
        sample_func_and_args=(agent.sample, (), dict(sample_count=100,
                                                     env=agent.env,
                                                     store_flag=True)))
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
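# Conceptually, MPC with SAMPLED_HORIZON=2 and SAMPLED_PATH_NUM=5 performs
# random shooting: sample candidate action sequences from the policy, roll
# each out through the learned dynamics, score them with the reward function,
# and execute the first action of the best sequence. A minimal standalone
# sketch under those assumptions (the helper names are hypothetical, not
# Baconian's API):
import numpy as np

def random_shooting_action(step_fn, reward_fn, sample_action_fn, obs,
                           horizon=2, num_paths=5):
    best_return, best_first_action = -np.inf, None
    for _ in range(num_paths):
        o, ret = obs, 0.0
        actions = [sample_action_fn() for _ in range(horizon)]
        for a in actions:
            o_next = step_fn(o, a)          # learned dynamics prediction
            ret += reward_fn(o, a, o_next)  # reward of the imagined step
            o = o_next
        if ret > best_return:
            best_return, best_first_action = ret, actions[0]
    return best_first_action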
# Model-based MPC on ModifiedHalfCheetah: an MLP dynamics model trained on a
# mix of random and on-policy data, with a hand-written cost/terminal function
# pair and a dedicated MBMPC training flow.
def task_fn():
    name = 'mpc_ModifiedHalfCheetah'
    env = make('ModifiedHalfCheetah')
    env_spec = env.env_spec
    mlp_dyna = MBMPC_MLPDynamics(
        env_spec=env_spec,
        name_scope=name + '_mlp_dyna',
        name=name + '_mlp_dyna',
        learning_rate=1e-3,
        mlp_config=[
            {"ACT": "TANH",
             "B_INIT_VALUE": 0.0,
             "NAME": "1",
             "L1_NORM": 0.0,
             "L2_NORM": 0.0,
             "N_UNITS": 128,
             "TYPE": "DENSE",
             "W_NORMAL_STDDEV": 0.03},
            {"ACT": "TANH",
             "B_INIT_VALUE": 0.0,
             "NAME": "2",
             "L1_NORM": 0.0,
             "L2_NORM": 0.0,
             "N_UNITS": 64,
             "TYPE": "DENSE",
             "W_NORMAL_STDDEV": 0.03},
            {"ACT": "LINEAR",
             "B_INIT_VALUE": 0.0,
             "NAME": "OUTPUT",
             "L1_NORM": 0.0,
             "L2_NORM": 0.0,
             "N_UNITS": env_spec.flat_obs_dim,
             "TYPE": "DENSE",
             "W_NORMAL_STDDEV": 0.03}
        ])
    # buffer
    rl_size = 500  # default 1000
    random_size = 500  # default 1000
    # algo
    horizon = 20
    dyna_epoch = 60
    # agent
    max_step = 500  # default 1000
    # TODO (9.22): should max_step == rl_size == random_size?
    batch_size = 128
    rand_rl_ratio = 0.1
    random_trajectory = 1
    # TODO (9.22): are there situations where the trajectory num must be != 1?
    on_policy_trajectory = 1
    on_policy_iter = 10
    num_simulated_paths = 50  # default 1000
    algo = ModelBasedModelPredictiveControl(
        dynamics_model=mlp_dyna,
        env_spec=env_spec,
        config_or_config_dict=dict(SAMPLED_HORIZON=horizon,
                                   SAMPLED_PATH_NUM=num_simulated_paths,
                                   dynamics_model_train_iter=dyna_epoch),
        name=name + '_algo',
        policy=UniformRandomPolicy(env_spec=env_spec, name='uniform_random'))
    algo.set_terminal_reward_function_for_dynamics_env(
        reward_func=MBMPC_HalfCheetah_CostFunc(name='cost_fn'),
        terminal_func=MBMPC_HalfCheetah_TerminalFunc(name='terminal_fn'))
    agent = MB_MPC_Agent(name=name + '_agent',
                         env=env,
                         env_spec=env_spec,
                         algo=algo,
                         exploration_strategy=None,
                         algo_saving_scheduler=None)
    flow = create_train_test_flow(env=env,
                                  env_spec=env_spec,
                                  rl_size=rl_size,
                                  max_step=max_step,
                                  batch_size=batch_size,
                                  random_size=random_size,
                                  rand_rl_ratio=rand_rl_ratio,
                                  train_iter=dyna_epoch,
                                  on_policy_iter=on_policy_iter,
                                  random_trajectory=random_trajectory,
                                  on_policy_trajectory=on_policy_trajectory,
                                  num_simulated_paths=num_simulated_paths,
                                  train_func_and_args=(agent.train, (), dict()),
                                  test_func_and_args=(agent.test, (), dict()),
                                  sample_func_and_args=(agent.sample, (), dict()),
                                  train_every_sample_count=None,
                                  test_every_sample_count=None,
                                  start_train_after_sample_count=None,
                                  start_test_after_sample_count=None,
                                  flow_type='MBMPC_TrainFlow')
    experiment = Experiment(tuner=None, env=env, agent=agent, flow=flow, name=name)
    experiment.run()
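# Every example above only defines a task_fn; Baconian experiments are then
# launched through an experiment runner. A minimal sketch, assuming the import
# paths below (they may differ across Baconian versions):
if __name__ == '__main__':
    from baconian.config.global_config import GlobalConfig
    from baconian.core.experiment_runner import single_exp_runner

    # Direct all experiment logs to a local directory, then execute the task
    # function; del_if_log_path_existed clears any stale log directory first.
    GlobalConfig().set('DEFAULT_LOG_PATH', './log_path')
    single_exp_runner(task_fn, del_if_log_path_existed=True)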