def main():
    render_bool = True
    if not render_bool:
        os.environ["SDL_VIDEODRIVER"] = "dummy"
    # else:
    #     pygame.display.set_mode((800, 600 + 60))

    # Create the environment
    game = GameEnv()
    p = PLE(game, display_screen=render_bool, fps=60, force_fps=False)
    p.init()

    # Build the agent with the PARL framework
    print(p.getActionSet())
    act_dim = len(p.getActionSet())
    width, height = p.getScreenDims()
    rpm = ReplayMemory(MEMORY_SIZE)  # experience replay buffer for DQN
    obs_dim = 1, width, height

    model = Model(act_dim=act_dim)
    alg = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(alg,
                  obs_dim=obs_dim,
                  act_dim=act_dim,
                  e_greed=0.5,  # probability of picking a random action (exploration)
                  e_greed_decrement=0.00001)

    # Load a saved model if one exists
    best_eval_reward = -1000
    if os.path.exists('./model_dqn.ckpt'):
        print("loaded model:", './model_dqn.ckpt')
        agent.restore('./model_dqn.ckpt')
        best_eval_reward = evaluate(p, agent, render=render_bool)

    # Warm up the replay memory so early training has enough diverse samples
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(p, agent, rpm)

    max_episode = 200000

    # Start training
    episode = 0
    while episode < max_episode:  # train for max_episode episodes; test episodes are not counted
        # train part
        for i in range(0, 5):
            total_reward = run_episode(p, agent, rpm)
            episode += 1

        # test part
        eval_reward = evaluate(p, agent, render=render_bool)  # set render=True to watch the game
        logger.info('episode:{} e_greed:{} test_reward:{}'.format(
            episode, agent.e_greed, eval_reward))

        # Save a checkpoint; the original used an undefined `rate_num` here,
        # `episode` is assumed instead
        agent.save('./model_dqn_%d.ckpt' % episode)
        if best_eval_reward < eval_reward:
            best_eval_reward = eval_reward
            agent.save('./model_dqn.ckpt')
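# The scripts in this file call `run_episode` and `evaluate` helpers that are
# not shown. A minimal sketch for a PLE-style environment follows, assuming a
# `getGameState()` observation and `LEARN_FREQ` / `BATCH_SIZE` constants; the
# scripts differ in observation type (screen pixels vs. game state) and in
# return signatures, so treat this as an assumption, not the repo's actual
# implementation.
import numpy as np


def run_episode(env, agent, rpm):
    # Play one episode with epsilon-greedy exploration, storing transitions
    # in the replay memory and learning every LEARN_FREQ steps.
    env.reset_game()
    obs = list(env.getGameState().values())
    total_reward, step = 0, 0
    while not env.game_over():
        step += 1
        action = agent.sample(obs)  # epsilon-greedy action
        reward = env.act(env.getActionSet()[action])
        next_obs = list(env.getGameState().values())
        done = env.game_over()
        rpm.append((obs, action, reward, next_obs, done))
        if len(rpm) > MEMORY_WARMUP_SIZE and step % LEARN_FREQ == 0:
            agent.learn(*rpm.sample(BATCH_SIZE))
        total_reward += reward
        obs = next_obs
    return total_reward


def evaluate(env, agent, render=False):
    # Run a few greedy episodes and average the reward.
    eval_reward = []
    for _ in range(5):
        env.reset_game()
        obs = list(env.getGameState().values())
        episode_reward = 0
        while not env.game_over():
            action = agent.predict(obs)  # greedy action, no exploration
            episode_reward += env.act(env.getActionSet()[action])
            obs = list(env.getGameState().values())
            if render:
                env.getScreenRGB()
        eval_reward.append(episode_reward)
    return np.mean(eval_reward)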
def main():
    env = Paddle()
    action_dims = 3
    obs_dims = 5

    rpm = ReplayMemory(MEMORY_SIZE)
    model = PadModel(action_dims)
    algorithm = DQN(model, action_dims, GAMMA, LEARNING_RATE)
    agent = PadAgent(algorithm, obs_dim=obs_dims, act_dim=action_dims)

    # use this to restore your model
    # agent.restore('./dqn_model.ckpt')

    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_eposide(agent, env, rpm)

    max_eposide = 1000
    for eposide in range(1, max_eposide + 1):
        total_reward = run_eposide(agent, env, rpm)
        print(total_reward)
        if eposide % 50 == 0:
            eval_reward = evaluate(agent, env)
            logger.info('eposide:{},test_reward:{}'.format(
                eposide, eval_reward))

    save_path = './dqn_model.ckpt'
    agent.save(save_path)
def train():
    env = gym.make(ENV_NAME)
    env.seed(0)
    np.random.seed(0)
    action_dim = env.action_space.n
    obs_shape = env.observation_space.shape

    rpm = ReplayMemory(MEMORY_SIZE)
    model = Model(act_dim=action_dim)
    algorithm = DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(algorithm, obs_dim=obs_shape[0], act_dim=action_dim)

    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(env, agent, rpm)

    rewards = []
    for episode in range(1, EPISODES + 1):
        total_reward = run_episode(env, agent, rpm)
        rewards.append(total_reward)
        average_reward = np.mean(rewards[-100:])
        logger.info(f'episode:{episode}\t e_greed:{agent.epsilon:.6f}\t reward:{total_reward:.2f} \t'
                    f'Avg reward: {average_reward:.2f}\n')
        # the original condition `if (episode+1) % SAVE_FREQ:` saved on almost
        # every episode; a checkpoint every SAVE_FREQ episodes is intended
        if episode % SAVE_FREQ == 0:
            agent.save(os.path.join(SAVE_PATH, f'{RUN_TAG}_{episode}.ckpt'))

    agent.save(os.path.join(SAVE_PATH, f'{RUN_TAG}_final.ckpt'))
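# Every script here builds a ReplayMemory, but its definition is not shown.
# A minimal sketch is given below, assuming the common
# (obs, action, reward, next_obs, done) tuple layout; the Atari script later
# in this file uses a different, image-based variant with a `size()` method,
# so this is an illustration rather than the repo's actual class.
import collections
import random

import numpy as np


class ReplayMemory:
    def __init__(self, max_size):
        # A bounded FIFO buffer: old transitions are discarded automatically
        self.buffer = collections.deque(maxlen=max_size)

    def append(self, exp):
        # exp is an (obs, action, reward, next_obs, done) tuple
        self.buffer.append(exp)

    def sample(self, batch_size):
        # Uniformly sample a batch and split it into per-field arrays
        batch = random.sample(self.buffer, batch_size)
        obs, action, reward, next_obs, done = zip(*batch)
        return (np.array(obs, dtype='float32'),
                np.array(action, dtype='int64'),
                np.array(reward, dtype='float32'),
                np.array(next_obs, dtype='float32'),
                np.array(done, dtype='float32'))

    def __len__(self):
        return len(self.buffer)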
def main():
    env = get_player(args.rom, image_size=IMAGE_SIZE, train=True,
                     frame_skip=FRAME_SKIP)
    test_env = get_player(args.rom, image_size=IMAGE_SIZE,
                          frame_skip=FRAME_SKIP, context_len=CONTEXT_LEN)
    rpm = ReplayMemory(MEMORY_SIZE, IMAGE_SIZE, CONTEXT_LEN)
    act_dim = env.action_space.n
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = AtariModel(CONTEXT_LEN, act_dim, args.algo)
    if args.algo in ['DQN', 'Dueling']:
        algorithm = DQN(model, gamma=GAMMA, lr=args.lr)
    elif args.algo == 'Double':
        algorithm = DDQN(model, gamma=GAMMA, lr=args.lr)
    agent = AtariAgent(algorithm, act_dim=act_dim)

    with tqdm(total=MEMORY_WARMUP_SIZE, desc='[Replay Memory Warm Up]') as pbar:
        while rpm.size() < MEMORY_WARMUP_SIZE:
            total_reward, steps, _ = run_train_episode(env, agent, rpm)
            pbar.update(steps)

    # Get fixed obs to check value function.
    fixed_obs = get_fixed_obs(rpm, args.batch_size)
    fixed_obs = torch.tensor(fixed_obs, dtype=torch.float, device=device)

    # train
    test_flag = 0
    total_steps = 0
    with tqdm(total=args.train_total_steps, desc='[Training Model]') as pbar:
        while total_steps < args.train_total_steps:
            total_reward, steps, loss = run_train_episode(env, agent, rpm)
            total_steps += steps
            pbar.update(steps)
            if total_steps // args.test_every_steps >= test_flag:
                while total_steps // args.test_every_steps >= test_flag:
                    test_flag += 1
                eval_rewards = []
                for _ in range(3):
                    eval_rewards.append(run_evaluate_episode(test_env, agent))
                summary.add_scalar('dqn/eval', np.mean(eval_rewards),
                                   total_steps)
                summary.add_scalar('dqn/score', total_reward, total_steps)
                summary.add_scalar('dqn/loss', loss, total_steps)
                summary.add_scalar('dqn/exploration', agent.exploration,
                                   total_steps)
                summary.add_scalar('dqn/Q value',
                                   evaluate_fixed_Q(agent, fixed_obs),
                                   total_steps)
                summary.add_scalar('dqn/grad_norm',
                                   get_grad_norm(agent.alg.model), total_steps)
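# `evaluate_fixed_Q` and `get_grad_norm` are referenced above but not shown.
# Hedged sketches follow: the mean max-Q on a fixed batch of observations (a
# diagnostic from the original DQN paper) and the global L2 gradient norm of
# the model. The real helpers in the repo may differ.
def evaluate_fixed_Q(agent, fixed_obs):
    # Mean of max_a Q(s, a) over a fixed, held-out batch of observations;
    # a steadily rising curve suggests value learning is progressing.
    with torch.no_grad():
        q_values = agent.alg.model(fixed_obs)
    return q_values.max(dim=1).values.mean().item()


def get_grad_norm(model):
    # L2 norm over all parameter gradients from the most recent backward pass.
    total = 0.0
    for p in model.parameters():
        if p.grad is not None:
            total += p.grad.detach().norm(2).item() ** 2
    return total ** 0.5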
def main():
    env = gym.make('LunarLander-v2')
    action_dim = env.action_space.n
    obs_shape = env.observation_space.shape

    rpm = ReplayMemory(MEMORY_SIZE)  # experience replay buffer for DQN
    model = Model(act_dim=action_dim)
    algorithm = DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_shape[0],
        act_dim=action_dim,
        e_greed=0.001,  # take a random action with some probability (exploration)
        e_greed_decrement=1e-6)  # decay exploration as training converges

    # Load a saved model; the original checked only `model_name`, a path that
    # never exists, so the restore was silently skipped
    model_dir = './modeldir'
    model_name = '/LunarLander.ckpt'
    baseEpisode = 2300
    if os.path.exists(model_dir + model_name):
        agent.restore(model_dir + model_name)

    # Warm up the replay memory so early training has enough diverse samples
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(env, agent, rpm)

    max_episode = 3000

    # Start training
    episode = 0
    last_eval_reward = 255.0

    # Test 10 episodes after every 50 training episodes
    while episode < max_episode:  # test episodes are not counted toward max_episode
        # train part
        for i in range(0, 50):
            total_reward = run_episode(env, agent, rpm)
            logger.info('Train: episode: {}, train_reward:{}'.format(
                episode + baseEpisode, total_reward))
            episode += 1

        # test part
        eval_reward = evaluate(env, agent, 10, render=True)  # set render=True to watch the game
        logger.info(
            'Evaluate: episode:{} e_greed:{} test_reward:{}'.format(
                episode + baseEpisode, agent.e_greed, eval_reward))
        if eval_reward > last_eval_reward:
            last_eval_reward = eval_reward
            agent.save(model_dir + '/LunarLander_dqn_{}.ckpt'.format(
                episode + baseEpisode))

    # Training finished; save the model
    agent.save(model_dir + '/LunarLander_dqn.ckpt')
def init_environment():
    env = BirdEnv()
    action_dim = 2
    model = BirdModel(action_dim)
    algorithm = DQN(model, action_dim, GAMMA, LEARNING_RATE)
    agent = BirdAgent(algorithm, action_dim)
    return env, agent
def train():
    # Create the environment
    game = FlappyBird()
    env_1 = PLE(game, fps=30, display_screen=False)
    env_2 = PLE(game, fps=30, display_screen=True)
    obs_dim = len(env_1.getGameState())
    act_dim = len(env_1.getActionSet())
    print('action set:', env_1.getActionSet())
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # Create the experience replay buffer for DQN
    rpm = ReplayMemory(MEMORY_SIZE)

    # Build the agent with the PARL framework
    model = Model(act_dim=act_dim)
    algorithm = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(algorithm,
                  obs_dim=obs_dim,
                  act_dim=act_dim,
                  e_greed=0.3,
                  e_greed_decrement=1e-6)

    # Load a saved model if one exists
    save_path = './flappybird.ckpt'
    if os.path.exists(save_path):
        agent.restore(save_path)

    # Warm up the replay memory so early training has enough diverse samples
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(env_1, agent, rpm)

    max_episode = 2000

    # Start training
    episode = 0
    while episode < max_episode:  # train for max_episode episodes; test episodes are not counted
        # train
        for i in range(0, 100):
            total_reward, steps = run_episode(env_1, agent, rpm)
            episode += 1

        # test
        eval_reward, steps = evaluate(env_2, agent)
        logger.info(
            '[episode:{}], e_greed:{:.6f}, steps:{}, test_reward:{}'.format(
                episode, agent.e_greed, steps, eval_reward))

        # Save a checkpoint
        ckpt = './models/episode_{}.ckpt'.format(episode)
        agent.save(ckpt)

    # Training finished; save the model
    save_path = './flappybird.ckpt'
    agent.save(save_path)
def eval():
    env = Paddle()
    action_dims = 3
    obs_dims = 5

    rpm = ReplayMemory(MEMORY_SIZE)
    model = PadModel(action_dims)
    algorithm = DQN(model, action_dims, GAMMA, LEARNING_RATE)
    agent = PadAgent(algorithm, obs_dim=obs_dims, act_dim=action_dims)

    # use this to test the model you want
    agent.restore('./dqn_model.ckpt')
    eval_reward = evaluate(agent, env)
    logger.info('test_reward:{}'.format(eval_reward))
def main():
    # Create the environment
    env = gym.make('MountainCar-v0')
    action_dim = env.action_space.n  # MountainCar-v0: 3
    obs_shape = env.observation_space.shape  # MountainCar-v0: (2,)

    # Create the experience replay buffer for DQN
    rpm = ReplayMemory(MEMORY_SIZE)

    model = Model(act_dim=action_dim)
    algorithm = DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(algorithm,
                  obs_dim=obs_shape[0],
                  act_dim=action_dim,
                  e_greed=0.1,
                  e_greed_decrement=1e-6)

    ckpt = 'model.ckpt'
    agent.restore(ckpt)
    evaluate_reward = evaluate(env, agent, render=True)
def main():
    env = JumpGame()
    action_dim = 2
    obs = env.reset()
    obs_shape = len(obs)

    rpm = ReplayMemory(MEMORY_SIZE)  # experience replay buffer for DQN

    # Build the agent with the PARL framework
    model = Model(act_dim=action_dim)
    algorithm = DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_shape,
        act_dim=action_dim,
        e_greed=0.1,  # take a random action with some probability (exploration)
        e_greed_decrement=1e-6)  # decay exploration as training converges

    # Load a saved model
    # save_path = './dqn_model.ckpt'
    # agent.restore(save_path)

    # Warm up the replay memory so early training has enough diverse samples
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(env, agent, rpm)

    max_episode = 20000

    # start train
    episode = 0
    while episode < max_episode:  # train for max_episode episodes; test episodes are not counted
        # train part
        for i in range(0, 50):
            total_reward = run_episode(env, agent, rpm)
            episode += 1

        # test part
        eval_reward = evaluate(env, agent, render=False)  # set render=True to watch the game
        logger.info('episode:{} e_greed:{} test_reward:{}'.format(
            episode, agent.e_greed, eval_reward))

    # Training finished; save the model
    save_path = './dqn_model_jump.ckpt'
    agent.save(save_path)
def main():
    env = PLE(Pixelcopter(), fps=30, display_screen=True,
              state_preprocessor=None)
    action_dim = len(env.getActionSet())
    obs_shape = len(env.getGameState())

    rpm = ReplayMemory(MEMORY_SIZE)  # experience replay buffer for DQN

    # Build the agent with the PARL framework
    model = Model(act_dim=action_dim)
    algorithm = DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_shape,
        act_dim=action_dim,
        e_greed=0.1,  # take a random action with some probability (exploration)
        e_greed_decrement=1e-6)  # decay exploration as training converges

    # Load a saved model
    # save_path = './dqn_model.ckpt'
    # agent.restore(save_path)

    # Warm up the replay memory so early training has enough diverse samples
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(env, agent, rpm)

    max_episode = 30000

    # Start training
    episode = 0
    while episode < max_episode:  # train for max_episode episodes; test episodes are not counted
        # train part
        for i in range(0, 50):
            total_reward = run_episode(env, agent, rpm)
            episode += 1

        # test part
        eval_reward, max_reward = evaluate(env, agent, render=False)  # set render=True to watch the game
        logger.info(
            'episode:{} e_greed:{} test_reward:{} max_reward:{}'.format(
                episode, agent.e_greed, eval_reward, max_reward))

    # Training finished; save the model
    save_path = './dqn_model.ckpt'
    agent.save(save_path)
def test():
    # Create the environment
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=True)
    obs_dim = len(env.getGameState())
    act_dim = len(env.getActionSet())
    print('action set:', env.getActionSet())
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # Create the experience replay buffer for DQN
    rpm = ReplayMemory(MEMORY_SIZE)

    # Build the agent with the PARL framework
    model = Model(act_dim=act_dim)
    algorithm = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(algorithm,
                  obs_dim=obs_dim,
                  act_dim=act_dim,
                  e_greed=0.3,
                  e_greed_decrement=1e-6)

    # Load the saved model
    save_path = './DQN/checkpoints/episode_V14600.ckpt'
    print('checkpoints:', save_path)
    if os.path.exists(save_path):
        logger.info('load ckpt success!')
        agent.restore(save_path)
    else:
        logger.error('load ckpt error!')

    action_set = env.getActionSet()
    env.init()
    episode_reward = 0
    steps = 0
    while not env.game_over():
        steps += 1
        if steps == 1:
            continue  # skip the very first frame
        obs = list(env.getGameState().values())
        action_idx = agent.predict(obs)  # predict the action; greedy only, no exploration
        act = action_set[action_idx]
        reward = env.act(act)
        episode_reward += reward
        reward_str = str(int(episode_reward))
        drawText(env.game.screen, reward_str, 288, 0, 48,
                 (255, 0, 0), (255, 255, 255))
    env.reset_game()
    logger.info('[Test] steps:{}, reward:{}'.format(steps, episode_reward))
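# `drawText` is called above but not defined in this file. A hedged
# pygame-based sketch follows, assuming the
# (surface, text, x, y, size, fg_color, bg_color) signature implied by the
# call site; the original helper may render differently.
import pygame


def drawText(screen, text, x, y, font_size, fg_color, bg_color):
    # Render `text` right-aligned at (x, y) onto `screen` and refresh the display.
    font = pygame.font.SysFont(None, font_size)
    surface = font.render(text, True, fg_color, bg_color)
    rect = surface.get_rect()
    rect.topright = (x, y)
    screen.blit(surface, rect)
    pygame.display.update()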
def play():
    env = gym.make(ENV_NAME)
    action_dim = env.action_space.n
    obs_shape = env.observation_space.shape

    model = Model(act_dim=action_dim)
    algorithm = DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(algorithm, obs_dim=obs_shape[0], act_dim=action_dim)

    ckpt_path = './dqn_model.ckpt'
    agent.restore(ckpt_path)
    eval_reward = evaluate(env, agent, render=True)  # set render=True to watch the game
    logger.info(f'Avg test reward:{eval_reward:.2f}')
def load_settings(self):
    self._obs_dim = obs_dim(self._mansion_attr)
    self._act_dim = act_dim(self._mansion_attr)
    self._global_step = 0
    self._rpm = ReplayMemory(MEMORY_SIZE, self._obs_dim, 1)
    self._model = RLDispatcherModel(self._act_dim)
    hyperparas = {
        'action_dim': self._act_dim,
        'lr': 5.0e-4,
        'gamma': 0.998
    }
    # print("action dimension:", self._obs_dim, self._act_dim)
    self._algorithm = DQN(self._model, hyperparas)
    self._agent = ElevatorAgent(self._algorithm, self._obs_dim,
                                self._act_dim)
    self._warm_up_size = 2000
    self._statistic_freq = 1000
    self._loss_queue = deque()
def main():
    # Create the environment
    game = Snake(width=200, height=200, init_length=5)
    p = PLE(game, fps=30, display_screen=False, force_fps=True)

    # Build the agent with the PARL framework
    print(p.getActionSet())
    act_dim = len(p.getActionSet())
    obs_dim = 200 * 200

    rpm = ReplayMemory(MEMORY_SIZE)  # experience replay buffer for DQN
    model = Model(act_dim=act_dim)
    alg = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(alg,
                  obs_dim=obs_dim,
                  act_dim=act_dim,
                  e_greed_decrement=1e-6,
                  e_greed=0.2)  # probability of picking a random action (exploration)

    # Load a saved model
    # if os.path.exists('./dqn_snake_400.ckpt'):
    #     agent.restore('./dqn_snake_400.ckpt')

    # Warm up the replay memory so early training has enough diverse samples
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(p, agent, rpm)

    max_episode = 2000000

    # Start training
    episode = 0
    best_reward = -float('inf')
    while episode < max_episode:  # train for max_episode episodes; test episodes are not counted
        # train part
        for i in range(0, 100):
            total_reward = run_episode(p, agent, rpm)
            episode += 1

        # test part
        eval_reward = evaluate(p, agent, render=True)  # set render=True to watch the game
        if eval_reward > best_reward:
            best_reward = eval_reward
            agent.save('model_dir/dqn_snake_{}.ckpt'.format(episode))
        logger.info('episode:{} e_greed:{} test_reward:{}'.format(
            episode, agent.e_greed, eval_reward))
def main():
    # Create the environment
    game = Snake(width=224, height=224, init_length=7)
    p = PLE(game, fps=30, display_screen=True, force_fps=True)

    # Build the agent with the PARL framework
    print(p.getActionSet())
    act_dim = len(p.getActionSet())

    model = Model(act_dim=act_dim)
    alg = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(alg,
                  act_dim=act_dim,
                  e_greed_decrement=1e-6,
                  e_greed=0.2)  # probability of picking a random action (exploration)

    # Load the saved model
    if os.path.exists('./dqn_snake_7.ckpt'):
        agent.restore('./dqn_snake_7.ckpt')

    evaluate(p, agent, False)
def main():
    env = gym.make('MountainCar-v0')  # expect reward > -110?
    action_dim = env.action_space.n  # MountainCar-v0: 3
    obs_shape = env.observation_space.shape  # MountainCar-v0: (2,)

    rpm = ReplayMemory(MEMORY_SIZE)  # experience replay buffer for DQN

    # Build the agent with the PARL framework
    model = choose_model(args.model, act_dim=action_dim)
    algorithm = DQN(model, act_dim=action_dim, gamma=GAMMA, lr=args.lr)
    agent = Agent(
        algorithm,
        obs_dim=obs_shape[0],
        act_dim=action_dim,
        e_greed=0.5,  # take a random action with some probability (exploration)
        e_greed_decrement=4e-5)  # decay exploration as training converges

    # Load a saved model
    # save_path = './dqn_model.ckpt'
    # agent.restore(save_path)

    # Warm up the replay memory so early training has enough diverse samples
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(env, agent, rpm)

    max_episode = 1500

    # start train
    episode = 0
    while episode < max_episode:
        # train part
        for i in range(0, 50):
            total_reward = run_episode(env, agent, rpm)
            episode += 1

        # test part
        eval_reward = evaluate(env, agent, render=False)  # set render=True to watch the game
        logger.info('episode:{} e_greed:{} test_reward:{}'.format(
            episode, agent.e_greed, eval_reward))

    # Training finished; save the model
    save_path = 'saved_model/dqn_model_%s_%s.ckpt' % (args.model, args.lr)
    agent.save(save_path)
BATCH_SIZE = 64  # samples per learning step, drawn at random from the replay memory
LEARNING_RATE = 0.001  # learning rate
GAMMA = 0.9  # reward discount factor, typically between 0.9 and 0.999

gpu = fluid.CUDAPlace(0)
fluid.Executor(gpu)

env = PaddleEnv()
action_dim = 3  # three possible actions
obs_shape = [5]  # five observation features

rpm = ReplayMemory(MEMORY_SIZE)  # experience replay buffer for DQN

# Build the agent with the PARL framework
model = Model(act_dim=action_dim)
algorithm = DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
agent = Agent(
    algorithm,
    obs_dim=obs_shape[0],
    act_dim=action_dim,
    e_greed=0.05,  # take a random action with some probability (exploration)
    e_greed_decrement=10e-7)  # decay exploration as training converges

# Load the saved model
save_path = './Model/dqn_model.ckpt'
agent.restore(save_path)

while True:  # run episodes indefinitely with the restored model
    obs = env.reset()
    episode_reward = 0
    while True:
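        # Hedged completion (not from the original source, which is cut off
        # here): assumes a gym-style env.step returning
        # (next_obs, reward, done, info); the real PaddleEnv API may differ.
        action = agent.predict(obs)  # greedy action from the restored model
        obs, reward, done, _ = env.step(action)
        episode_reward += reward
        if done:
            print('episode_reward:', episode_reward)
            break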
        if render:
            env.getScreenRGB()
        if env.game_over():
            break
    eval_reward.append(episode_reward)
    return np.mean(eval_reward)


env = Catcher(500, 500)
env = PLE(env, fps=10, display_screen=True, force_fps=False)
act_dim = len(env.getActionSet())
obs_dim = len(env.getGameState())

rpm = ReplayMemory(MEMORY_SIZE)
model = Model(act_dim=act_dim)
alg = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
agent = Agent(alg,
              obs_dim=obs_dim,
              act_dim=act_dim,
              e_greed=0.1,  # the original swapped these two values
              e_greed_decrement=1e-6)  # (e_greed=1e-6, e_greed_decrement=0.1)

# Fill the replay memory
while len(rpm) < MEMORY_WARMUP_SIZE:
    run_episode(env, agent, rpm)

max_episode = 2000

# start train
episode = 0
def main():
    env = gym.make('FlappyBird-v0')
    test_env = Monitor(env, directory='test',
                       video_callable=lambda x: True, force=True)
    rpm = ReplayMemory(MEMORY_SIZE)
    act_dim = env.action_space.n

    model = Model(act_dim)
    algorithm = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(algorithm,
                  act_dim=act_dim,
                  e_greed=E_GREED,
                  e_greed_decrement=E_GREED_DECREMENT)

    # Load a saved model
    # save_path = './dqn_model.ckpt'
    # agent.restore(save_path)

    with tqdm(total=MEMORY_WARMUP_SIZE, desc='[Replay Memory Warm Up]') as pbar:
        while len(rpm) < MEMORY_WARMUP_SIZE:
            total_reward, steps, _ = run_train_episode(env, agent, rpm)
            pbar.update(steps)

    # train
    best_reward = -5
    pbar = tqdm(total=train_total_steps)
    test_flag = 0
    total_steps = 0
    while total_steps < train_total_steps:
        # start epoch
        total_reward, steps, loss = run_train_episode(env, agent, rpm)
        total_steps += steps
        pbar.set_description('[train]exploration:{}'.format(agent.e_greed))
        summary.add_scalar('dqn/score', total_reward, total_steps)
        summary.add_scalar('dqn/loss', loss, total_steps)  # mean of total loss
        summary.add_scalar('dqn/exploration', agent.e_greed, total_steps)
        pbar.update(steps)

        if total_steps // test_every_steps >= test_flag:
            while total_steps // test_every_steps >= test_flag:
                test_flag += 1
            pbar.write("testing")
            eval_rewards = []
            for _ in tqdm(range(3), desc='eval agent'):
                eval_reward = run_evaluate_episode(test_env, agent)
                eval_rewards.append(eval_reward)
            logger.info(
                "eval_agent done, (steps, eval_reward): ({}, {})".format(
                    total_steps, np.mean(eval_rewards)))
            eval_test = np.mean(eval_rewards)
            summary.add_scalar('dqn/eval', eval_test, total_steps)
            if eval_test > best_reward:
                agent.save('./best_dqn_model.ckpt')
                best_reward = eval_test

    pbar.close()

    # Training finished; save the model
    save_path = './dqn_model.ckpt'
    agent.save(save_path)
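# `run_train_episode` above returns (total_reward, steps, loss), unlike the
# plainer `run_episode` helper elsewhere in this file. A hedged sketch for a
# gym-style environment follows, assuming `LEARN_FREQ` / `BATCH_SIZE`
# constants; the repo's loss bookkeeping may differ.
import numpy as np


def run_train_episode(env, agent, rpm):
    # One epsilon-greedy episode; returns the reward, step count, and the
    # mean training loss over the learn calls made during the episode.
    obs = env.reset()
    total_reward, steps, losses = 0, 0, []
    while True:
        steps += 1
        action = agent.sample(obs)  # epsilon-greedy action
        next_obs, reward, done, _ = env.step(action)
        rpm.append((obs, action, reward, next_obs, done))
        if len(rpm) > MEMORY_WARMUP_SIZE and steps % LEARN_FREQ == 0:
            losses.append(agent.learn(*rpm.sample(BATCH_SIZE)))
        total_reward += reward
        obs = next_obs
        if done:
            break
    return total_reward, steps, np.mean(losses) if losses else 0.0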
def main():
    # env = gym.make('MountainCar-v0')  # MountainCar-v0: expected reward > -120
    # action_dim = env.action_space.n          # CartPole-v0: 2
    # obs_shape = env.observation_space.shape  # CartPole-v0: (4,)
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=True, state_preprocessor=None)
    # agent = myAgentHere(allowed_actions=p.getActionSet())
    # Raw PLE loop kept from the original as a commented-out alternative:
    # p.init()
    # reward = 0.0
    # nb_frames = 1000
    # for i in range(nb_frames):
    #     if p.game_over():
    #         p.reset_game()
    #     observation = p.getScreenRGB()
    #     # action = agent.pickAction(reward, observation)
    #     action = 0
    #     reward = p.act(action)

    action_dim = len(env.getActionSet())
    obs_shape = len(env.getGameState())

    rpm = ReplayMemory(MEMORY_SIZE)  # experience replay buffer for DQN

    # Build the agent with the PARL framework
    model = Model(act_dim=action_dim)
    algorithm = DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_shape,
        act_dim=action_dim,
        e_greed=0.1,  # take a random action with some probability (exploration)
        e_greed_decrement=1e-6)  # decay exploration as training converges

    # Load a saved model
    # save_path = './dqn_model.ckpt'
    # agent.restore(save_path)

    # Warm up the replay memory so early training has enough diverse samples
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(env, agent, rpm)

    max_episode = 200000

    # start train
    episode = 0
    while episode < max_episode:  # train for max_episode episodes; test episodes are not counted
        # train part
        for i in range(0, 50):
            total_reward = run_episode(env, agent, rpm)
            episode += 1

        # test part
        eval_reward = evaluate(env, agent, render=False)  # set render=True to watch the game
        logger.info('episode:{} e_greed:{} test_reward:{}'.format(
            episode, agent.e_greed, eval_reward))

    # Training finished; save the model
    save_path = './dqn_model.ckpt'
    agent.save(save_path)
def main():
    env = P()
    action_dim = 3
    obs = env.reset()
    print(obs)
    obs_shape = len(obs)

    rpm = ReplayMemory(MEMORY_SIZE)  # experience replay buffer for DQN

    # Build the agent with the PARL framework
    model = Model(act_dim=action_dim)
    algorithm = DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_shape,
        act_dim=action_dim,
        e_greed=0.1,  # take a random action with some probability (exploration)
        e_greed_decrement=1e-6)  # decay exploration as training converges

    # Load a saved model
    # save_path = './dqn_model.ckpt'
    # agent.restore(save_path)

    # Warm up the replay memory so early training has enough diverse samples
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(env, agent, rpm)

    max_episode = 5000
    test_reward = []
    score = 30
    best_episode = 0

    # start train
    episode = 0
    while episode < max_episode:  # train for max_episode episodes; test episodes are not counted
        # train part
        total_reward = run_episode(env, agent, rpm)
        if episode % 10 == 0:
            logger.info('episode:{} e_greed:{} train_reward:{}'.format(
                episode, agent.e_greed, total_reward))

        # test part
        if episode % 100 == 0:
            eval_reward = evaluate(env, agent, render=False)  # set render=True to watch the game
            logger.info('episode:{} e_greed:{} test_reward:{}'.format(
                episode, agent.e_greed, eval_reward))
            test_reward.append(eval_reward)
            if eval_reward >= score:
                score = eval_reward
                best_episode = episode
                agent.save('./model_paddle_best.ckpt')
                print('save model_best')
            print('model_best:', best_episode, score)

        episode += 1

    # Training finished; save the model
    save_path = './paddle_model.ckpt'
    agent.save(save_path)