def test_result():
    #############
    #   test    #
    #############
    policy_model = DQNModel(4, 18)

    env = atari_wrappers.make_atari('RiverraidNoFrameskip-v4')
    env = atari_wrappers.wrap_deepmind(env, clip_rewards=True, frame_stack=True, pytorch_img=True)
    policy_model.load_model(torch.load('./data/dqn_Riverraid_qnetwork_target_state_dict.pickle'))

    num_episodes = 5
    episode = 1
    score = 0
    ep_score = []

    while episode <= num_episodes:
        observation = env.reset()
        done = False
        while not done:
            with torch.no_grad():
                # scale the stacked uint8 frames to [0, 1] and add a batch dimension
                t_observation = torch.from_numpy(np.array(observation)).double() / 255
                t_observation = t_observation.unsqueeze(0)
                q_value = policy_model.forward(t_observation)
                action = q_value.argmax(1).item()
            env.render()
            time.sleep(0.0005)
            next_observation, reward, done, info = env.step(action)
            score += reward
            observation = next_observation
            # a game only counts as finished once all lives are lost
            if info['ale.lives'] == 0:
                episode += 1
                ep_score.append(score)
                score = 0

    print("Average Score : {}".format(int(np.mean(ep_score))))
    print(ep_score)
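# The conversion from an environment observation to a network input appears in both
# test_result() above and Agent.act() below. The helper sketched here is NOT part of the
# original code; it is a minimal, self-contained illustration of that step, assuming
# observations arrive as a (C, H, W) stack of uint8 frames (as produced by wrap_deepmind
# with frame_stack=True and pytorch_img=True). The name preprocess_observation is
# hypothetical.
import numpy as np
import torch

def preprocess_observation(observation, device="cpu"):
    """Hypothetical helper: scale a uint8 frame stack to [0, 1] and add a batch dim."""
    t_observation = torch.from_numpy(np.array(observation)).double() / 255  # (C, H, W) in [0, 1]
    return t_observation.unsqueeze(0).to(device)                            # (1, C, H, W)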
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, in_channels, action_size, seed):
        """Initialize an Agent object."""
        self.in_channels = in_channels
        self.action_size = action_size

        # Q-Networks: a local network for action selection / learning and a
        # target network for computing TD targets
        self.qnetwork_local = DQNModel(in_channels, action_size)
        self.qnetwork_target = DQNModel(in_channels, action_size)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Time step counter (used to decide when to learn)
        self.t_step = 0
        self.loss_list = []

    def step(self, observation, action, reward, next_observation, done, num_frames):
        # Save experience in replay memory
        self.memory.add(observation, action, reward, next_observation, done)
        self.t_step = num_frames

        # Learn every `skip_frame` time steps, once enough samples are available in memory.
        if self.t_step % skip_frame == 0:
            if len(self.memory) > BATCH_SIZE:
                self.learn()

    def act(self, observation, eps=0.):
        """Return an action for the given observation under an epsilon-greedy policy."""
        t_observation = torch.from_numpy(observation).double() / 255  # scale grayscale frames to [0, 1]
        t_observation = t_observation.unsqueeze(0).to(device)

        # Epsilon-greedy action selection
        if random.random() > eps:
            action_values = self.qnetwork_local.forward(t_observation)
            # action_values has shape (1, action_size); argmax over dim 1 picks the greedy action
            action = action_values.argmax(1).data.cpu().numpy().astype(int)[0]
        else:
            action = random.sample(range(self.action_size), 1)[0]
        return action

    def learn(self):
        observations, actions, rewards, next_observations, dones = self.memory.sample()

        observations = torch.from_numpy(np.array(observations) / 255).double().to(device)
        actions = torch.from_numpy(np.array(actions).astype(int)).int().to(device)
        actions = actions.view(actions.shape[0], 1)
        rewards = torch.from_numpy(np.array(rewards)).double().to(device)
        rewards = rewards.view(rewards.shape[0], 1)
        next_observations = torch.from_numpy(np.array(next_observations) / 255).double().to(device)
        dones = torch.from_numpy(np.array(dones).astype(int)).int().to(device)
        dones = dones.view(dones.shape[0], 1)

        # TD target: r + gamma * max_a' Q_target(s', a');
        # the bootstrap term is dropped for terminal transitions (done == 1)
        Q_target_next = self.qnetwork_target.forward(next_observations).max(1)[0].unsqueeze(1)
        Q_target = rewards + gamma * Q_target_next * (1 - dones)

        # Current estimate: Q_local(s, a) for the actions actually taken
        Q_local = self.qnetwork_local.forward(observations).gather(1, actions.long())

        loss = self.huber_loss(Q_local, Q_target)
        self.qnetwork_local.backward(Q_target, Q_local, "huber", actions)
        self.loss_list.append(loss.cpu().numpy())
        self.qnetwork_local.step()

        # Move the target network towards the local network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters: θ_target = tau*θ_local + (1 - tau)*θ_target."""
        self.qnetwork_target.soft_update(local_model, tau)

    def huber_loss(self, input, target, beta=1, size_average=True):
        """Huber loss: quadratic for small errors, linear for large ones,
        which makes the update less sensitive to outlier TD errors."""
        n = torch.abs(input - target)
        cond = n < beta
        loss = torch.where(cond, 0.5 * n ** 2 / beta, n - 0.5 * beta)
        if size_average:
            return loss.mean()
        return loss.sum()
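# A minimal sketch of how the Agent above might be driven during training. This loop is
# NOT part of the original code: the function name train_agent, the hyperparameter values,
# and the linear epsilon schedule are illustrative assumptions. The module-level constants
# used by Agent (BUFFER_SIZE, BATCH_SIZE, gamma, TAU, skip_frame, device) and the
# atari_wrappers / DQNModel / ReplayBuffer modules are assumed to be defined elsewhere in
# the project, as in test_result() above.
import numpy as np

def train_agent(num_frames=100000, eps_start=1.0, eps_end=0.1, eps_decay_frames=50000):
    env = atari_wrappers.make_atari('RiverraidNoFrameskip-v4')
    env = atari_wrappers.wrap_deepmind(env, clip_rewards=True, frame_stack=True, pytorch_img=True)
    agent = Agent(in_channels=4, action_size=env.action_space.n, seed=0)

    observation = env.reset()
    for frame in range(1, num_frames + 1):
        # linearly annealed epsilon (illustrative schedule)
        eps = max(eps_end, eps_start - (eps_start - eps_end) * frame / eps_decay_frames)
        action = agent.act(np.array(observation), eps)
        next_observation, reward, done, info = env.step(action)
        # store the transition; Agent.step() triggers learning every `skip_frame` frames
        agent.step(np.array(observation), action, reward, np.array(next_observation), done, frame)
        observation = env.reset() if done else next_observation
    return agent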