def main(): # 创建BipedalWalker环境 env = gym.make("BipedalWalkerHardcore-v3") env.reset() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # 使用parl框架搭建Agent:QuadrotorModel, DDPG, QuadrotorAgent三者嵌套 model = BipedalWalkerModel(act_dim) algorithm = DDPG(model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR) agent = BipedalWalkerAgent(algorithm, obs_dim, act_dim) if os.path.exists('model_dir5/steps_10075.ckpt'): agent.restore('model_dir5/steps_10075.ckpt') print("restore succeed") # parl库也为DDPG算法内置了ReplayMemory,可直接从 parl.utils 引入使用 rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim) test_flag = 0 total_steps = 0 best_reward = -float('inf') while total_steps < TRAIN_TOTAL_STEPS: train_reward, steps = run_episode(env, agent, rpm) total_steps += steps #logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward)) if total_steps // TEST_EVERY_STEPS >= test_flag: while total_steps // TEST_EVERY_STEPS >= test_flag: test_flag += 1 evaluate_reward = evaluate(env, agent) logger.info('Steps {}, Test reward: {}'.format( total_steps, evaluate_reward)) if evaluate_reward >= best_reward: best_reward = evaluate_reward # 保存模型 ckpt = 'model_dir5/steps_{}_reward_{}.ckpt'.format( total_steps, round(best_reward, 2)) agent.save(ckpt) # 保存模型 ckpt = 'model_dir5/steps_{}_reward_{}.ckpt'.format(total_steps, evaluate_reward) agent.save(ckpt)
def main(): # 创建飞行器环境 env = make_env("Quadrotor", task="velocity_control", seed=1) env.reset() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] + 1 model = QuadrotorModel(act_dim) algorithm = DDPG(model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR) agent = QuadrotorAgent(algorithm, obs_dim, act_dim) ckpt = 'steps_490883_reward_-20.52.ckpt' agent.restore(ckpt) evaluate_reward = evaluate(env, agent, True) logger.info('Evaluate reward: {}'.format(evaluate_reward)) # 打印评估的reward
def main(): # 创建飞行器环境 env = make_env("Quadrotor", task="no_collision", seed=1) env.reset() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] + 1 model = QuadrotorModel(act_dim) algorithm = DDPG(model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR) agent = QuadrotorAgent(algorithm, obs_dim, act_dim) ckpt = 'steps_970464_reward_467.17.ckpt' agent.restore(ckpt) evaluate_reward = evaluate(env, agent, render=True) logger.info('Evaluate reward: {}'.format(evaluate_reward))
def main():
    # Create the environment
    env = Paddle()
    np.random.seed(0)
    act_dim = 3
    obs_dim = 5

    # Build the agent with the PARL framework
    model = Model(act_dim)
    algorithm = DDPG(model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
    agent = Agent(algorithm, obs_dim, act_dim)

    # Load a saved model if one exists
    if os.path.exists('./model.ckpt'):
        save_path = './model.ckpt'
        agent.restore(save_path)
        print("model loaded successfully")

    # Create the replay buffer
    rpm = ReplayMemory(MEMORY_SIZE)

    # Warm up the replay buffer before training starts
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(agent, env, rpm)

    episode = 0
    while episode < TRAIN_EPISODE:
        for i in range(50):
            total_reward = run_episode(agent, env, rpm)
            episode += 1

        eval_reward = evaluate(env, agent, render=False)
        logger.info('episode:{} test_reward:{}'.format(episode, eval_reward))
        save_path = './model/dqn_model_{}_{}.ckpt'.format(i, total_reward)
        agent.save(save_path)

    # Save the final model to ./model.ckpt
    agent.save('./model.ckpt')
def main():
    # Create the environment (SUMO driven through TraCI)
    traci.start([sumoBinary, "-c", "data/cross.sumocfg",
                 "--tripinfo-output", "tripinfo.xml"])
    traci.trafficlight.setPhase("0", 0)
    act_dim = 2
    obs_dim = 1440  # (10, 24, 6)

    # Build the agent with the PARL framework
    model = Model(act_dim)
    algorithm = DDPG(model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
    agent = Agent(algorithm, obs_dim, act_dim)

    # Load a saved model if one exists
    if os.path.exists('./model.ckpt'):
        save_path = './model.ckpt'
        agent.restore(save_path)
        print("model loaded successfully")

    env = 0  # placeholder; the simulation is driven directly through TraCI

    # Create the replay buffer
    rpm = ReplayMemory(MEMORY_SIZE)

    # Warm up the replay buffer before training starts
    while len(rpm) < MEMORY_WARMUP_SIZE:
        save_data(agent, env, rpm)

    episode = 0
    while episode < TRAIN_EPISODE:
        for i in range(50):
            total_reward, steps = run_episode(agent, env, rpm)
            episode += 1

        eval_reward = evaluate(env, agent, render=False)
        logger.info('episode:{} test_reward:{}'.format(episode, eval_reward))
        save_path = './model/model_{}_{}.ckpt'.format(i, total_reward)
        agent.save(save_path)

    # Save the final model to ./model.ckpt
    agent.save('./model.ckpt')
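# The Paddle and SUMO snippets above construct ReplayMemory(MEMORY_SIZE) with a single
# argument and check len(rpm), which matches the deque-based buffer from the PARL
# beginner tutorials rather than parl.utils.ReplayMemory. The sketch below is only an
# assumption about that missing class, not the authors' exact implementation.
import random
import collections
import numpy as np

class ReplayMemory(object):
    def __init__(self, max_size):
        self.buffer = collections.deque(maxlen=max_size)

    def append(self, exp):
        # exp is a tuple of (obs, action, reward, next_obs, done)
        self.buffer.append(exp)

    def sample(self, batch_size):
        mini_batch = random.sample(self.buffer, batch_size)
        obs_b, act_b, rew_b, next_obs_b, done_b = zip(*mini_batch)
        return (np.array(obs_b).astype('float32'),
                np.array(act_b).astype('float32'),
                np.array(rew_b).astype('float32'),
                np.array(next_obs_b).astype('float32'),
                np.array(done_b).astype('float32'))

    def __len__(self):
        return len(self.buffer)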
def main(): # 创建BipedalWalker环境 env = gym.make("BipedalWalker-v3") env.reset() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # 使用parl框架搭建Agent:QuadrotorModel, DDPG, QuadrotorAgent三者嵌套 model = BipedalWalkerModel(act_dim) algorithm = DDPG(model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR) agent = BipedalWalkerAgent(algorithm, obs_dim, act_dim) ckpt = 'model.ckpt' agent.restore(ckpt) evaluate(env, agent, False)
def main(): # 创建飞行器环境 env = make_env("Quadrotor_hovering_control", task="hovering_control") env.reset() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] print(obs_dim, act_dim) model = QuadrotorModel(act_dim + 1) algorithm = DDPG(model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR) agent = QuadrotorAgent(algorithm, obs_dim, act_dim + 1) ckpt = 'steps_700176.ckpt' # 请设置ckpt为你训练中效果最好的一次评估保存的模型文件名称 agent.restore(ckpt) evaluate_reward = evaluate(env, agent) logger.info('Evaluate reward: {}'.format(evaluate_reward)) # 打印评估的reward
def stock_trade(stock_file, mode_path):
    day_profits = []
    df = pd.read_csv(stock_file)
    df = df.sort_values('date')

    # Create the stock trading environment
    env = StockTradingEnv(df)
    act_dim = env.action_space.shape[0]
    obs_dim = env.observation_space.shape[0]

    # Build the agent with the PARL framework: StockModel, DDPG and StockAgent
    # nested inside each other
    model = StockModel(act_dim)
    algorithm = DDPG(model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
    agent = StockAgent(algorithm, obs_dim, act_dim)

    df_test = pd.read_csv(stock_file.replace('train', 'test'))

    # Load the model if the checkpoint exists
    if os.path.exists(mode_path):
        agent.restore(mode_path)

    env2 = StockTradingEnv(df_test)
    obs = env2.reset()
    for i in range(len(df_test) - 1):
        batch_obs = np.expand_dims(obs, axis=0)
        action = agent.predict(batch_obs.astype('float32'))
        action = np.squeeze(action)
        action = np.random.normal(action, 1.0)
        action = np.clip(action, -1.0, 1.0)  # keep the noisy action within the model's [-1, 1] range
        action = action_mapping(action, env2.action_space.low[0],
                                env2.action_space.high[0])
        next_obs, reward, done, info = env2.step(action)
        obs = next_obs
        profit = env2.render()
        day_profits.append(profit)
        if done:
            break
    return day_profits
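# action_mapping, used in stock_trade above and in several other snippets, comes from
# parl.utils in PARL 1.x and linearly rescales an action from the model's [-1, 1]
# range into the environment's [low, high] range. A minimal equivalent, shown here
# only in case that import is unavailable:
def action_mapping(model_output_act, low_bound, high_bound):
    # map [-1, 1] -> [low_bound, high_bound]
    return low_bound + (model_output_act + 1.0) * 0.5 * (high_bound - low_bound)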
def main():
    # Create the BipedalWalker environment
    env = gym.make("BipedalWalker-v3")
    env.reset()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Build the agent with the PARL framework: BipedalWalkerModel, DDPG and
    # BipedalWalkerAgent nested inside each other
    model = BipedalWalkerModel(act_dim)
    algorithm = DDPG(model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
    agent = BipedalWalkerAgent(algorithm, obs_dim, act_dim)

    ckpt = 'model.ckpt'
    agent.restore(ckpt)
    print("restore succeeded")

    best_reward = -float('inf')
    for i in range(5):
        evaluate_reward, steps = evaluate(env, agent, render=False)
        if evaluate_reward > best_reward:
            best_reward = evaluate_reward
        logger.info('Episode:{}, Evaluate reward: {}'.format(i, evaluate_reward))
    print("best_reward:", best_reward)
def main(): # 创建飞行器环境 env = make_env("Quadrotor", task="hovering_control") env.reset() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] act_dim = act_dim + 1 model = QuadrotorModel(act_dim) algorithm = DDPG(model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR) agent = QuadrotorAgent(algorithm, obs_dim, act_dim) # parl库也为DDPG算法内置了ReplayMemory,可直接从 parl.utils 引入使用 rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim) # 启动训练 test_flag = 0 total_steps = 0 while total_steps < TRAIN_TOTAL_STEPS: train_reward, steps = run_episode(env, agent, rpm) total_steps += steps logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward)) # 打印训练reward if total_steps // TEST_EVERY_STEPS >= test_flag: # 每隔一定step数,评估一次模型 while total_steps // TEST_EVERY_STEPS >= test_flag: test_flag += 1 evaluate_reward = evaluate(env, agent) logger.info('Steps {}, Test reward: {}'.format( total_steps, evaluate_reward)) # 打印评估的reward # 每评估一次,就保存一次模型,以训练的step数命名 ckpt = 'model_dir/steps_{}.ckpt'.format(total_steps) agent.save(ckpt)
def main():
    # Create the environment
    game = Pong(width=200, height=200, MAX_SCORE=11)
    p = PLE(game, fps=30, display_screen=True, force_fps=False)
    p.reset_game()

    print(p.getActionSet())
    act_dim = len(p.getActionSet())
    print("act_dim:", act_dim)
    obs_dim = 200 * 200

    # Build the agent with the PARL framework: PongModel, DDPG and PongAgent
    # nested inside each other
    model = PongModel(act_dim)
    algorithm = DDPG(model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
    agent = PongAgent(algorithm, obs_dim, act_dim)

    rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim)

    max_episode = 20000

    # Start training: run max_episode training episodes (test episodes are not counted)
    episode = 0
    best_reward = -float('inf')
    while episode < max_episode:
        # train part
        for i in range(0, 50):
            total_reward = run_episode(p, agent, rpm)
            episode += 1

        # test part
        eval_reward = evaluate(p, agent, render=True)  # render=True to watch the agent play
        if eval_reward > best_reward:
            best_reward = eval_reward
            agent.save('model_dir/ddpg_pong_{}.ckpt'.format(episode))
        logger.info('episode:{} test_reward:{}'.format(episode, eval_reward))
obs_dim = env.observation_space.shape[0]

# Create the replay buffer
rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim)

# Build the agent with the PARL framework
######################################################################
######################################################################
#
# 4. Following the classroom demo, nest Model, DDPG and Agent to build the agent
#
######################################################################
######################################################################
model = QuadrotorModel(act_dim=act_dim)
algorithm = DDPG(model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
agent = Agent(algorithm, obs_dim=obs_dim, act_dim=act_dim)

# Load a saved model
# save_path = './dqn_model.ckpt'
# agent.restore(save_path)

# Pre-fill the replay buffer first so early training has enough sample diversity
max_episode = 500

test_flag = 0
total_steps = 0
while total_steps < TRAIN_TOTAL_STEPS:
    train_reward, steps = run_episode(env, agent, rpm)
                break
        total_reward.append(c_r)
        env_reward.append(d_r)
    total_reward.append(np.mean(total_reward))
    env_reward.append(np.mean(env_reward))
    return total_reward, env_reward


env = make_env("Quadrotor", task="velocity_control")
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]
logger.info('obs_dim:{} , act_dim:{}'.format(obs_dim, act_dim))

model = QModel(act_dim)
alg = DDPG(model, gamma=Gamma, tau=Tau, actor_lr=A_lr, critic_lr=C_lr)
agent = QAgent(alg, obs_dim, act_dim)
rpm = ReplayMemory(int(Max_Size), obs_dim, act_dim)

# loadpath = 'M1_-1027_Over.ckpt'
# if os.path.exists(loadpath):
#     agent.restore(loadpath)
#     logger.info('Already Loaded')
# else:
#     exit(1)

test_flag = 0
total_step = 0
reward_max = -1027
while True:
class Agent(parl.Agent):
    def __init__(self, obs_dim, act_dim, max_action, gamma, tau, actor_lr, critic_lr):
        model = Model(act_dim, max_action)
        self.alg = DDPG(model, gamma, tau, actor_lr, critic_lr)
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        super(Agent, self).__init__(self.alg)

        # Note: synchronize the parameters of self.model and self.target_model at the very start.
        self.alg.sync_target(decay=0)

    def build_program(self):
        self.pred_program = fluid.Program()
        self.learn_program = fluid.Program()

        with fluid.program_guard(self.pred_program):
            obs = layers.data(name='obs', shape=[self.obs_dim], dtype='float32')
            self.pred_act = self.alg.predict(obs)

        with fluid.program_guard(self.learn_program):
            obs = layers.data(name='obs', shape=[self.obs_dim], dtype='float32')
            act = layers.data(name='act', shape=[self.act_dim], dtype='float32')
            reward = layers.data(name='reward', shape=[], dtype='float32')
            next_obs = layers.data(name='next_obs', shape=[self.obs_dim], dtype='float32')
            terminal = layers.data(name='terminal', shape=[], dtype='bool')
            self.actor_cost, self.critic_cost = self.alg.learn(
                obs, act, reward, next_obs, terminal)

    def predict(self, obs):
        obs = np.expand_dims(obs, axis=0)
        act = self.fluid_executor.run(self.pred_program,
                                      feed={'obs': obs},
                                      fetch_list=[self.pred_act])[0]
        act = np.squeeze(act)
        return act

    def learn(self, obs, act, reward, next_obs, terminal):
        feed = {
            'obs': obs,
            'act': act,
            'reward': reward,
            'next_obs': next_obs,
            'terminal': terminal
        }
        [critic_cost, actor_cost] = self.fluid_executor.run(
            self.learn_program,
            feed=feed,
            fetch_list=[self.critic_cost, self.actor_cost])
        self.alg.sync_target()
        return critic_cost[0], actor_cost[0]

    def save_model4visual(self, dir, name):
        feed_vars = ['obs', 'act', 'reward', 'next_obs', 'terminal']
        target_vars = [self.actor_cost, self.critic_cost]
        executor = self.fluid_executor
        program = self.learn_program
        fluid.io.save_inference_model(dir, feed_vars, target_vars, executor,
                                      program, program_only=True)
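# Hypothetical usage of the fluid-based Agent above, assuming a Model class is defined
# elsewhere and actions are normalized to [-1, 1] (max_action=1.0). The dimensions and
# hyperparameter values below are placeholders, not taken from the source.
agent = Agent(obs_dim=24, act_dim=4, max_action=1.0,
              gamma=0.99, tau=0.001, actor_lr=1e-4, critic_lr=1e-3)
obs = np.zeros(24, dtype='float32')
action = agent.predict(obs)  # returns a single action vector of length act_dim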
def main(train=True):
    if train:
        # Load the data
        df = pd.read_csv('wudigushi/DATA/AAPL.csv')
        df = df.sort_values('Date')

        # Create the environment
        env = StockTradingEnv(df)
        env.reset()
        act_dim = env.action_space.shape[0]
        obs_dim = env.observation_space.shape[0]
        print(act_dim)
        print(obs_dim)

        model = WudigupiaoModel(act_dim)
        algorithm = DDPG(model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
        agent = MyStockAgent(algorithm, obs_dim, act_dim)

        rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim)

        test_flag = 0
        total_steps = 0
        while total_steps < TRAIN_TOTAL_STEPS:
            train_reward, steps = run_episode(env, agent, rpm)
            total_steps += steps
            # logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))

            if total_steps // TEST_EVERY_STEPS >= test_flag:
                # print('s1')
                while total_steps // TEST_EVERY_STEPS >= test_flag:
                    test_flag += 1
                evaluate_reward = evaluate(env, agent)
                logger.info('Steps {}, Test reward: {}'.format(total_steps, evaluate_reward))

                ckpt = 'wudigushi/ckpt/steps_{}.ckpt'.format(total_steps)
                agent.save(ckpt)
    else:
        ckpt = 'wudigushi/ckpt/steps_980117.ckpt'  # set ckpt to the best checkpoint saved during training

        df = pd.read_csv('wudigushi/DATA/AAPL.csv')
        df = df.sort_values('Date')

        # Create the environment
        env = StockTradingEnv(df)
        env.reset()
        act_dim = env.action_space.shape[0]
        obs_dim = env.observation_space.shape[0]
        # obs_dim = 36
        print(act_dim)
        print(obs_dim)

        model = WudigupiaoModel(act_dim)
        algorithm = DDPG(model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
        agent = MyStockAgent(algorithm, obs_dim, act_dim)

        agent.restore(ckpt)
        evaluate_reward = evaluate(env, agent)
        logger.info('Evaluate reward: {}'.format(evaluate_reward))  # log the evaluation reward