def main():
    """Evaluate a trained TD3 agent on the Quadrotor no-collision task.

    Restores the actor and critic weights from ``model_dir`` and runs four
    rendered evaluation episodes, printing the reward of each.
    """
    # Build the quadrotor environment.
    env = make_env("Quadrotor", task="no_collision", seed=1)
    env.reset()
    obs_dim = env.observation_space.shape[0]
    # One extra action dimension on top of the raw action space
    # (project convention — the same +1 appears in the sibling scripts).
    act_dim = env.action_space.shape[0] + 1
    max_action = float(env.action_space.high[0])

    model = QuadrotorModel(act_dim, max_action)
    algorithm = parl.algorithms.TD3(
        model,
        max_action=max_action,
        gamma=GAMMA,
        tau=TAU,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = QuadrotorAgent(algorithm, obs_dim, act_dim)

    # FIX: the original allocated a ReplayMemory(MEMORY_SIZE) here; it was
    # never used during evaluation, so the dead allocation was removed.
    agent.restore_critic('model_dir/critic.ckpt')
    agent.restore_actor('model_dir/actor.ckpt')

    # Run four rendered evaluation episodes.
    for episode in range(1, 5):
        evaluate_reward = run_evaluate_episode(
            env, agent, max_action, is_render=True)
        print("evaluate_reward: ", evaluate_reward)
def main():
    """Restore a DDPG checkpoint and evaluate it on velocity control."""
    # Build the quadrotor velocity-control environment.
    env = make_env("Quadrotor", task="velocity_control", seed=1)
    env.reset()

    obs_dim = env.observation_space.shape[0]
    # One extra action dimension beyond the raw action space.
    act_dim = env.action_space.shape[0] + 1

    model = QuadrotorModel(act_dim)
    algorithm = DDPG(
        model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
    agent = QuadrotorAgent(algorithm, obs_dim, act_dim)

    # Restore the saved model weights.
    checkpoint = 'steps_490883_reward_-20.52.ckpt'
    agent.restore(checkpoint)

    # Evaluate with rendering enabled and log the mean reward.
    mean_reward = evaluate(env, agent, True)
    logger.info('Evaluate reward: {}'.format(mean_reward))
def main():
    """Restore a DDPG checkpoint and evaluate it on the no-collision task."""
    # Build the quadrotor no-collision environment.
    env = make_env("Quadrotor", task="no_collision", seed=1)
    env.reset()

    obs_dim = env.observation_space.shape[0]
    # One extra action dimension beyond the raw action space.
    act_dim = env.action_space.shape[0] + 1

    model = QuadrotorModel(act_dim)
    algorithm = DDPG(
        model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
    agent = QuadrotorAgent(algorithm, obs_dim, act_dim)

    # Restore the saved model weights.
    checkpoint = 'steps_970464_reward_467.17.ckpt'
    agent.restore(checkpoint)

    # Evaluate with rendering enabled and log the mean reward.
    mean_reward = evaluate(env, agent, render=True)
    logger.info('Evaluate reward: {}'.format(mean_reward))
def main():
    """Restore a DDPG checkpoint and evaluate it on hovering control."""
    # Build the quadrotor hovering-control environment.
    env = make_env("Quadrotor_hovering_control", task="hovering_control")
    env.reset()

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    print(obs_dim, act_dim)

    # Model and agent use one extra action dimension on top of the env's.
    model = QuadrotorModel(act_dim + 1)
    algorithm = DDPG(
        model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
    agent = QuadrotorAgent(algorithm, obs_dim, act_dim + 1)

    # Set this to the best checkpoint saved during training.
    checkpoint = 'steps_700176.ckpt'
    agent.restore(checkpoint)

    # Evaluate and log the mean reward.
    mean_reward = evaluate(env, agent)
    logger.info('Evaluate reward: {}'.format(mean_reward))
def main():
    """Train a DDPG agent on hovering control.

    Runs episodes until TRAIN_TOTAL_STEPS environment steps have elapsed;
    every TEST_EVERY_STEPS steps the agent is evaluated and a checkpoint
    named after the step count is saved under ``model_dir/``.
    """
    # Build the quadrotor hovering-control environment.
    env = make_env("Quadrotor_hovering_control", task="hovering_control")
    env.reset()

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    print(obs_dim, act_dim)

    # Model and agent use one extra action dimension on top of the env's.
    model = QuadrotorModel(act_dim + 1)
    algorithm = DDPG(
        model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
    agent = QuadrotorAgent(algorithm, obs_dim, act_dim + 1)

    # parl also ships a ReplayMemory for DDPG (importable from parl.utils).
    replay_buffer = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim + 1)

    # Training loop.
    test_flag = 0
    total_steps = 0
    while total_steps < TRAIN_TOTAL_STEPS:
        train_reward, steps = run_episode(env, agent, replay_buffer)
        total_steps += steps
        if total_steps // TEST_EVERY_STEPS >= test_flag:
            # Advance test_flag past the current evaluation window
            # (equivalent to the original increment-until loop), then run
            # one evaluation and save a checkpoint named by step count.
            test_flag = total_steps // TEST_EVERY_STEPS + 1
            evaluate_reward = evaluate(env, agent)
            logger.info('Steps {}, Test reward: {}'.format(
                total_steps, evaluate_reward))
            ckpt = 'model_dir/steps_{}.ckpt'.format(total_steps)
            agent.save(ckpt)
            # NOTE(review): fragment — the enclosing evaluate() function
            # starts before this chunk; indentation is reconstructed and
            # should be confirmed against the full file.
            env.render()
        # Record this episode's return before the next rollout.
        eval_reward.append(total_reward)
    # Mean episode reward over all evaluation episodes.
    return np.mean(eval_reward)


if __name__ == "__main__":
    # Build the quadrotor velocity-control environment (the make_env
    # wrapper call below was left disabled by the author).
    # env = make_env("Quadrotor", task="velocity_control", seed=0)
    env = Quadrotor(task="velocity_control", seed=0)
    env.reset()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    # Build the agent with the parl framework.
    model = QuadrotorModel(act_dim=act_dim)
    algorithm = DDPG(
        model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
    # obs_dim + 3: the observation is augmented with 3 extra values
    # (presumably the target velocity for velocity_control — TODO confirm
    # against run_episode/evaluate in the full file).
    agent = QuadrotorAgent(
        algorithm=algorithm, obs_dim=obs_dim + 3, act_dim=act_dim)
    # parl also ships a ReplayMemory for DDPG (importable from parl.utils);
    # it must match the augmented observation width.
    rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim + 3, act_dim)
    # Sentinel so the first evaluation always becomes the new best.
    best_test_reward = -10000
    # agent.restore('model_dir/best.ckpt')
# 创建飞行器环境 env = make_env("Quadrotor", task="no_collision", seed=1) env.reset() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] + 1 #CHANGE max_action = float(env.action_space.high[0]) print("max action: ", max_action) #model = QuadrotorModel(act_dim) #algorithm = DDPG(model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR) #CHANGE model = QuadrotorModel(act_dim, max_action) algorithm = parl.algorithms.TD3(model, max_action=max_action, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR) agent = QuadrotorAgent(algorithm, obs_dim, act_dim) rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim) # 启动训练 test_flag = 0 total_steps = 0 best_reward = -float('inf') while total_steps < TRAIN_TOTAL_STEPS: train_reward, steps = run_episode(env, agent, rpm)