def main():
    """Evaluate a TD3 agent on the Quadrotor no-collision task from saved checkpoints."""
    # Build the quadrotor environment; fixed seed keeps evaluation reproducible.
    env = make_env("Quadrotor", task="no_collision", seed=1)
    env.reset()
    obs_dim = env.observation_space.shape[0]
    # +1 extra action dimension on top of the raw action space — presumably a
    # shared base thrust; TODO confirm against QuadrotorModel.
    act_dim = env.action_space.shape[0] + 1
    max_action = float(env.action_space.high[0])

    model = QuadrotorModel(act_dim, max_action)
    algorithm = parl.algorithms.TD3(
        model,
        max_action=max_action,
        gamma=GAMMA,
        tau=TAU,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = QuadrotorAgent(algorithm, obs_dim, act_dim)
    # Replay memory is constructed here as in training, though evaluation
    # below never feeds it.
    rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim)

    agent.restore_critic('model_dir/critic.ckpt')
    agent.restore_actor('model_dir/actor.ckpt')

    # Render four evaluation episodes and report each return.
    for _ in range(1, 5):
        evaluate_reward = run_evaluate_episode(
            env, agent, max_action, is_render=True)
        print("evaluate_reward: ", evaluate_reward)
def test_load_valid_conf(self):
    """A valid simulator config should load and expose dict observations."""
    env = make_env(
        'Quadrotor',
        task='no_collision',
        obs_as_dict=True,
        simulator_conf=self.valid_config)
    obs = env.reset()
    # Dict observations carry the craft position under the 'x' key.
    self.assertTrue('x' in obs)
def test_load_invalid_conf(self):
    """An invalid simulator config must raise RuntimeError mentioning 'quality'.

    Bug fix: the original try/except silently passed when no exception was
    raised at all. assertRaises turns the absence of the expected error into
    a test failure.
    """
    with self.assertRaises(RuntimeError) as ctx:
        env = make_env(
            'Quadrotor',
            task='no_collision',
            simulator_conf=self.invalid_config)
        env.reset()
    # Same message check as before, now guaranteed to run.
    self.assertTrue('quality' in str(ctx.exception))
def test_hovering_control_task(self):
    """Smoke test: a full hovering_control episode with random actions completes."""
    env = make_env('Quadrotor', task='hovering_control')
    env.reset()
    action = env.action_space.sample()
    done = False
    while not done:
        _, _, done, _ = env.step(action)
        action = env.action_space.sample()
def test_no_collision_task(self):
    """Smoke test: a full no_collision episode with random actions completes."""
    env = make_env('Quadrotor', task='no_collision')
    env.reset()
    action = env.action_space.sample()
    done = False
    while not done:
        _, _, done, _ = env.step(action)
        action = env.action_space.sample()
def test_velocity_control_task(self):
    """velocity_control exposes the next target velocity and runs env.nt steps."""
    env = make_env('Quadrotor', task='velocity_control')
    env.reset()
    done = False
    steps_taken = 0
    while not done:
        _, _, done, info = env.step([1., 1., 1., 1.])
        # Each transition must report the upcoming target velocity x-component.
        self.assertTrue('next_target_g_v_x' in info)
        steps_taken += 1
    # The episode length equals the configured number of timesteps.
    self.assertEqual(steps_taken, env.nt)
def test_no_collision_task(self):
    """With the fixed test config, equal thrust must not move the craft in x/y."""
    env = make_env(
        'Quadrotor', task='no_collision', simulator_conf=self.valid_config)
    obs = env.reset()
    start_x, start_y = obs['x'], obs['y']
    done = False
    while not done:
        # NOTE(review): this env variant returns a 3-tuple from step() —
        # confirm against the wrapper used by this test class.
        obs, _, done = env.step([1., 1., 1., 1.])
    err = 'quadrotor gets wrong movement in %s direction.'
    self.assertEqual(obs['x'], start_x, err % 'x')
    self.assertEqual(obs['y'], start_y, err % 'y')
def test_velocity_control_task(self):
    """Dict observations in velocity_control carry the next target velocity."""
    env = make_env(
        'Quadrotor',
        task='velocity_control',
        obs_as_dict=True,
        simulator_conf=self.valid_config)
    obs = env.reset()
    done = False
    count = 0
    while not done:
        # 3-tuple step() return, matching the other tests in this class.
        obs, _, done = env.step([1., 1., 1., 1.])
        self.assertTrue('next_target_g_v_x' in obs)
        count += 1
    self.assertEqual(count, env.nt)
def main():
    """Load a DDPG checkpoint and evaluate it on the velocity_control task."""
    # Build the quadrotor environment with a fixed seed.
    env = make_env("Quadrotor", task="velocity_control", seed=1)
    env.reset()
    obs_dim = env.observation_space.shape[0]
    # One extra action dimension beyond the raw action space — TODO confirm
    # against QuadrotorModel's expected output size.
    act_dim = env.action_space.shape[0] + 1

    model = QuadrotorModel(act_dim)
    algorithm = DDPG(
        model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
    agent = QuadrotorAgent(algorithm, obs_dim, act_dim)

    agent.restore('steps_490883_reward_-20.52.ckpt')
    evaluate_reward = evaluate(env, agent, True)
    # Report the evaluation return.
    logger.info('Evaluate reward: {}'.format(evaluate_reward))
def main():
    """Load a trained DDPG checkpoint and evaluate it on the no-collision task."""
    # Build the quadrotor environment with a fixed seed.
    env = make_env("Quadrotor", task="no_collision", seed=1)
    env.reset()
    obs_dim = env.observation_space.shape[0]
    # One extra action dimension beyond the raw action space — TODO confirm
    # against QuadrotorModel's expected output size.
    act_dim = env.action_space.shape[0] + 1

    model = QuadrotorModel(act_dim)
    algorithm = DDPG(
        model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
    agent = QuadrotorAgent(algorithm, obs_dim, act_dim)

    agent.restore('steps_970464_reward_467.17.ckpt')
    evaluate_reward = evaluate(env, agent, render=True)
    logger.info('Evaluate reward: {}'.format(evaluate_reward))
def main():
    """Load the best DDPG checkpoint and evaluate it on hovering_control.

    Bug fix: the env id passed to make_env was "Quadrotor_hovering_control",
    which is inconsistent with every other script in this project — they all
    pass the id "Quadrotor" and select the task via the task= keyword.
    """
    env = make_env("Quadrotor", task="hovering_control")
    env.reset()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    print(obs_dim, act_dim)
    # +1 extra action dimension, matching how the agent was trained.
    model = QuadrotorModel(act_dim + 1)
    algorithm = DDPG(
        model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
    agent = QuadrotorAgent(algorithm, obs_dim, act_dim + 1)
    # Set ckpt to the best-performing checkpoint saved during training.
    ckpt = 'steps_700176.ckpt'
    agent.restore(ckpt)
    evaluate_reward = evaluate(env, agent)
    # Report the evaluation return.
    logger.info('Evaluate reward: {}'.format(evaluate_reward))
def main():
    """Train DDPG on hovering_control, periodically evaluating and checkpointing."""
    # Build the quadrotor environment.
    env = make_env("Quadrotor", task="hovering_control")
    env.reset()
    obs_dim = env.observation_space.shape[0]
    # One extra action dimension beyond the raw action space.
    act_dim = env.action_space.shape[0]
    act_dim = act_dim + 1

    model = QuadrotorModel(act_dim)
    algorithm = DDPG(
        model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
    agent = QuadrotorAgent(algorithm, obs_dim, act_dim)
    # parl.utils also ships a ReplayMemory usable directly with DDPG.
    rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim)

    # Training loop: run episodes until the step budget is exhausted.
    test_flag = 0
    total_steps = 0
    while total_steps < TRAIN_TOTAL_STEPS:
        train_reward, steps = run_episode(env, agent, rpm)
        total_steps += steps
        logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))

        # Evaluate once every TEST_EVERY_STEPS environment steps.
        if total_steps // TEST_EVERY_STEPS >= test_flag:
            # Catch test_flag up in case one episode spanned several intervals.
            while total_steps // TEST_EVERY_STEPS >= test_flag:
                test_flag += 1
            evaluate_reward = evaluate(env, agent)
            logger.info('Steps {}, Test reward: {}'.format(
                total_steps, evaluate_reward))
            # Save a checkpoint after every evaluation, named by step count.
            ckpt = 'model_dir/steps_{}.ckpt'.format(total_steps)
            agent.save(ckpt)
# @File : velocity.py
# @Software: PyCharm
import numpy as np
from parl.utils import action_mapping
from rlschool import make_env


def evaluate_episode(env, render=False):
    """Run 5 episodes driving all rotors at the minimum action.

    Returns a list of the 5 episode returns with their mean appended
    as the final element.
    """
    rewards = []
    for _ in range(5):
        env.reset()
        episode_return = 0
        while True:
            raw_action = np.array([-1, -1, -1, -1], dtype='float32')
            # Map from [-1, 1] onto the simulator's actuator range.
            mapped = action_mapping(raw_action, env.action_space.low[0],
                                    env.action_space.high[0])
            _, reward, done, _ = env.step(mapped)
            episode_return += reward
            if render:
                env.render()
            if done:
                break
        rewards.append(episode_return)
    rewards.append(np.mean(rewards))
    return rewards


env = make_env("Quadrotor", task="velocity_control")
Test_Reward = evaluate_episode(env, render=True)
print(Test_Reward)
# NOTE(review): this chunk begins mid-function — the opening of the episode
# loop (and the def line it belongs to) is outside the visible source, so the
# code below is kept byte-identical rather than reconstructed. After the
# `return total_reward, steps` the remainder is module-level DDPG setup.
# NOTE(review): set_trace() drops into a debugger on import — presumably
# left over from debugging; confirm it should be removed before shipping.
env.action_space.high[0]) next_obs, reward, done, _ = env.step(action) agent.step(obs, action, REWARD_SCALE * reward, next_obs, done) obs = next_obs total_reward += reward steps += 1 if done: break return total_reward, steps env = make_env("Quadrotor", task="hovering_control") env.reset() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] set_trace() model = Model(state_size=obs_dim, action_size=act_dim) target_model = Model(state_size=obs_dim, action_size=act_dim) alg = DDPG(model, target_model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR) agent = Agent(alg, BUFFER_SIZE, BATCH_SIZE, seed=10)
# NOTE(review): this chunk begins mid-function — the tail of an evaluation
# loop whose def line is outside the visible source — and ends mid-script
# (`test_flag = 0` starts a training loop that continues past this view),
# so the code is kept byte-identical rather than reconstructed. It clips the
# action to [-1, 1], maps it onto the actuator range, accumulates episode
# returns, then at module level builds a DDPG agent and replay memory.
env_action = np.clip(env_action, -1.0, 1.0) env_action = action_mapping(env_action, env.action_space.low[0], env.action_space.high[0]) next_obs, reward, done, info = env.step(env_action) obs = next_obs total_reward += reward steps += 1 if done: break eval_reward.append(total_reward) return np.mean(eval_reward) # Create the quadrotor environment env = make_env("Quadrotor", task="no_collision", seed=1) env.reset() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] + 1 model = QuadrotorModel(act_dim) algorithm = DDPG(model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR) agent = QuadrotorAgent(algorithm, obs_dim, act_dim) rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim) # Start training test_flag = 0