def play_game(options):
    """Play Flappy Bird with a pretrained DQN model.

    Params
    ======
        options.ckpt_path (str): checkpoint file containing the trained DQN weights
        options.cuda (bool): run the model on GPU
        options.max_score (int): stop the game once the score exceeds twice this value
    """
    model = QNetwork()

    if options.ckpt_path is None:
        print('you should give a weight file name.')
        return
    print('load previous model weight: {}'.format(options.ckpt_path))
    episode, epsilon = load_checkpoint(options.ckpt_path, model)

    if options.cuda:
        model = model.cuda()
    algorithm = DQN(model, optim, epsilon, options)  # the optimizer is unused at inference time
    algorithm.set_eval()

    bird_game = game.GameState()
    bird_game.FPS = 480
    action = [1, 0]  # one-hot action vector passed to the game
    o, r, terminal = bird_game.frame_step(action)
    o = preprocess(o)

    rpm = ReplayMemory(1, options)  # replay memory with capacity 1
    rpm.append(o, action, r, terminal)

    start = time.time()
    fc = 0
    score = 0
    while True:
        prev_o, a, r, o, terminal = rpm.sample(1)
        # q = algorithm(o).cpu().detach().numpy()[0]
        score = max(score, bird_game.score)
        action = algorithm.get_optim_action(o)
        o, r, terminal = bird_game.frame_step(action)
        o = preprocess(o)
        # img = Image.fromarray((o*255).astype(np.uint8)).convert(mode='L')
        # img.save(f'{fc}-{r}-{q.argmax()}.png')
        # fc += 1
        if terminal or score > options.max_score * 2:
            break
        rpm.append(o, action, r, terminal)

    ela = time.time() - start
    print(f'Final Score {score}, FPS {bird_game.FPS}, {ela//60:.0f}m{ela%60:.0f}s')


# if __name__ == "__main__":
#     main()
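# A minimal sketch of how play_game might be wired to the command line.
# The project's real option parser lives elsewhere in the repo; the flags below
# (ckpt_path, cuda, max_score) only mirror the attributes play_game reads,
# and the defaults and script name are assumptions, not the project's values.
import argparse


def parse_play_options():
    parser = argparse.ArgumentParser(description='Play Flappy Bird with a pretrained DQN')
    parser.add_argument('--ckpt_path', type=str, default=None,
                        help='checkpoint file holding the trained weights')
    parser.add_argument('--cuda', action='store_true',
                        help='run the network on GPU')
    parser.add_argument('--max_score', type=int, default=10,
                        help='the game stops once the score exceeds 2x this value')
    return parser.parse_args()


# Assumed usage:
#   python play.py --ckpt_path ckpt/best_42.ckpt --cuda
# options = parse_play_options()
# play_game(options)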
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        if torch.cuda.is_available():
            self.qnetwork_local = self.qnetwork_local.cuda()
            self.qnetwork_target = self.qnetwork_target.cuda()

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
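# A minimal usage sketch for the Agent above, assuming a classic Gym-style
# environment (env.reset() -> state, env.step(action) -> (next_state, reward,
# done, info)). The environment name, episode count, and epsilon schedule are
# illustrative assumptions, not values taken from this project.
def train(env, agent, n_episodes=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    eps = eps_start
    scores = []
    for episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0
        done = False
        while not done:
            action = agent.act(state, eps)                       # epsilon-greedy action
            next_state, reward, done, _ = env.step(action)       # advance the environment
            agent.step(state, action, reward, next_state, done)  # store experience, learn periodically
            state = next_state
            score += reward
        eps = max(eps_end, eps_decay * eps)                      # decay exploration over episodes
        scores.append(score)
    return scores


# Assumed usage with any discrete-action environment, e.g.:
# import gym
# env = gym.make('LunarLander-v2')
# agent = Agent(state_size=8, action_size=4, seed=0)
# scores = train(env, agent)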
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed,
                 fc1_units=64, fc2_units=64, fc3_units=None, double_q=False):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            fc1_units, fc2_units, fc3_units (int): hidden layer sizes of the Q-network
            double_q (bool): use double Q-learning targets
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.double_q = double_q

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed,
                                       fc1_units, fc2_units, fc3_units).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed,
                                        fc1_units, fc2_units, fc3_units).to(device)
        if torch.cuda.is_available():
            self.qnetwork_local.cuda()
            self.qnetwork_target.cuda()
        else:
            self.qnetwork_local.cpu()
            self.qnetwork_target.cpu()
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR, weight_decay=WD)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def get_action(self, state, eps=0.):
        """Epsilon-greedy action selection for the given state."""
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        with torch.no_grad():
            action_values = self.qnetwork_local(state)

        if random.random() <= eps:
            return np.random.choice(np.arange(self.action_size))
        else:
            return action_values.argmax().item()

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # Double Q-learning: the local network selects the next action,
        # the target network evaluates it.
        argmax_a = self.qnetwork_local(next_states).detach().argmax(dim=1).unsqueeze(dim=1)
        a_val = self.qnetwork_target(next_states).detach()
        Q_targets_next = a_val.gather(1, argmax_a)
        Q_targets = rewards + gamma * Q_targets_next * (1 - dones)

        Q_expected = self.qnetwork_local(states).gather(1, actions)
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
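# A small side-by-side sketch of the two bootstrapped targets, to make the
# double Q-learning step in Agent.learn explicit. The tensors here are generic;
# in the agent, q_local_next / q_target_next come from qnetwork_local and
# qnetwork_target evaluated on next_states.
import torch


def vanilla_dqn_target(q_target_next, rewards, dones, gamma):
    # Plain DQN: max over the target network's own estimates (prone to over-estimation)
    best_q = q_target_next.max(dim=1, keepdim=True)[0]
    return rewards + gamma * best_q * (1 - dones)


def double_dqn_target(q_local_next, q_target_next, rewards, dones, gamma):
    # Double DQN: the local network picks the action, the target network scores it
    argmax_a = q_local_next.argmax(dim=1, keepdim=True)
    chosen_q = q_target_next.gather(1, argmax_a)
    return rewards + gamma * chosen_q * (1 - dones)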
def train_dqn(options):
    max_episode = options.max_episode
    flappyBird = game.GameState()
    print(f'FPS {flappyBird.FPS}')
    rpm = ReplayMemory(options.rpm_size, options)  # experience replay buffer for DQN
    model = QNetwork()

    if options.resume and options.ckpt_path is not None:
        print('load previous model weight: {}'.format(options.ckpt_path))
        episode, epsilon = load_checkpoint(options.ckpt_path, model)
    else:
        epsilon = options.init_e
        episode = 0
    if options.cuda:
        model = model.cuda()
    optimizer = optim.Adam(model.parameters(), lr=options.lr)
    algorithm = DQN(model, optimizer, epsilon, options)

    # Pre-fill the replay memory so the first training batches have enough sample diversity
    while len(rpm) < options.rpm_size / 4:
        run_episode(algorithm, flappyBird, rpm, options)
    print(f'observation done {len(rpm)}')

    # Start training
    logname = time.strftime('%Y-%m-%d %H-%M-%S', time.localtime())
    logger = get_logger(f'log/{logname}.log')
    best_reward = 0
    max_score = 0
    begin = time.time()
    while episode < max_episode:  # train for max_episode episodes; test episodes are not counted
        # train part
        reward, loss, score = run_episode(algorithm, flappyBird, rpm, options)
        algorithm.epsilon = max(algorithm.final_e, algorithm.epsilon - algorithm.e_decrement)
        episode += 1
        max_score = max(max_score, score)
        if episode % 10 == 0:
            logger.info('episode:[{}/{}]\tscore:{:.3f}\ttrain_reward:{:.5f}\tloss:{:.5f}'.format(
                episode, max_episode, score, reward, loss))

        # test part
        if episode % options.evaluate_freq == 0:
            eval_reward, score = evaluate(flappyBird, algorithm, options)
            mid = time.time()
            elapsed = round(mid - begin)
            logger.info('episode:[{}/{}]\tscore:{:.3f}\tepsilon:{:.5f}\ttest_reward:{:.5f}\t{}:{}'.format(
                episode, max_episode, score, algorithm.epsilon, eval_reward, elapsed//60, elapsed%60))
            if eval_reward > best_reward:
                best_reward = eval_reward
                save_path = f'ckpt/best_{score}.ckpt'
                save_checkpoint({
                    'episode': episode,
                    'epsilon': algorithm.epsilon,
                    'state_dict': model.state_dict(),
                }, False, save_path)

        if episode % 1000 == 0:
            save_path = f'ckpt/episode_{episode}.ckpt'
            save_checkpoint({
                'episode': episode,
                'epsilon': algorithm.epsilon,
                'state_dict': model.state_dict(),
            }, False, save_path)

    # Training finished: save the final model
    save_path = f'ckpt/final_{episode}_{score}.ckpt'
    save_checkpoint({
        'episode': episode,
        'epsilon': algorithm.epsilon,
        'state_dict': model.state_dict(),
    }, False, save_path)
    mid = time.time()
    elapsed = round(mid - begin)
    logger.info('training completed, {} episodes, {}m {}s'.format(max_episode, elapsed//60, elapsed%60))
    print(f'max_score {max_score}')
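# save_checkpoint / load_checkpoint are called above but defined elsewhere in the
# repo. The sketch below is only an assumption about their shape, reconstructed
# from how they are used here: load returns (episode, epsilon) and restores the
# model, save takes a state dict, an is_best flag, and a target path.
import os
import shutil
import torch


def save_checkpoint(state, is_best, save_path):
    ckpt_dir = os.path.dirname(save_path)
    if ckpt_dir:
        os.makedirs(ckpt_dir, exist_ok=True)
    torch.save(state, save_path)
    if is_best:
        # keep a separate copy of the best-performing checkpoint
        shutil.copyfile(save_path, os.path.join(ckpt_dir, 'model_best.ckpt'))


def load_checkpoint(ckpt_path, model):
    checkpoint = torch.load(ckpt_path, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])
    return checkpoint['episode'], checkpoint['epsilon']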
class Agent():

    def __init__(self, state_size, action_size, random_seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, random_seed)
        self.qnetwork_target = QNetwork(state_size, action_size, random_seed)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Move to GPU if CUDA is available
        if train_on_gpu:
            self.qnetwork_local.cuda()
            self.qnetwork_target.cuda()

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # Learn, if enough samples are available in memory
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().unsqueeze(0)
        if train_on_gpu:
            state = state.cuda()

        self.qnetwork_local.eval()
        with torch.no_grad():  # replaces the deprecated Variable(state, volatile=True)
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy policy: most of the probability mass goes to the greedy action
        max_action = np.argmax(action_values.cpu().data.numpy())
        policy_s = np.ones(self.action_size) * eps / self.action_size
        policy_s[max_action] = 1 - eps + (eps / self.action_size)
        action = np.random.choice(np.arange(self.action_size), p=policy_s)
        return action

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute loss
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ----------------------- update target network ----------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
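# A tiny worked example of the epsilon-greedy distribution built in act():
# with action_size = 4 and eps = 0.1, the greedy action gets
# 1 - eps + eps/4 = 0.925 and every other action gets eps/4 = 0.025,
# so the probabilities still sum to 1. The numbers are illustrative only.
import numpy as np

eps, action_size, max_action = 0.1, 4, 2
policy_s = np.ones(action_size) * eps / action_size
policy_s[max_action] = 1 - eps + (eps / action_size)
print(policy_s)        # [0.025 0.025 0.925 0.025]
print(policy_s.sum())  # 1.0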