from copy import deepcopy
from typing import Optional

import acme
import acme.types
import dm_env
import torch
import torch.nn as nn


class VanillaDQN(acme.Actor):
    """Vanilla Deep Q-learning as implemented in the original Nature paper."""

    def __init__(
            self,
            network: nn.Module,
            actions: int,
            logger: Optional = None,
            learning_rate: float = 0.00025,
            replay_start_size: int = 50000,
            replay_size: int = 1000000,
            batch_size: int = 32,
            sync_target_step: int = 10000,
            update_frequency: int = 4,
            gradient_clipping: bool = False,
            reward_clipping: bool = True,
            gamma: float = 0.99,
            epsilon_start: float = 1.0,
            epsilon_end: float = 0.1,
            epsilon_end_step: int = 1000000,
            epsilon_testing: float = 0.05,
            training: bool = True,
            device: str = 'gpu',
            seed: Optional[int] = None
    ):
        """Initializes a DQN agent.

        Args:
            network: a neural network to learn the Q-function
            actions: number of actions the agent can take
            logger: a logger with a `write` method that receives scalars and a time step
            learning_rate: the learning rate for the optimizer
            replay_start_size: minimum number of samples in memory before optimization
                starts; also the number of time steps taken before epsilon starts decaying
            replay_size: maximum size of the replay buffer
            batch_size: number of samples for each parameter update
            sync_target_step: number of policy updates between target network updates
            update_frequency: number of time steps between learning steps
            gradient_clipping: if True, the gradients are clipped between -1 and 1
            reward_clipping: if True, the rewards are clipped between -1 and 1
            gamma: the discount factor of the MDP
            epsilon_start: value of epsilon at the start of training
            epsilon_end: value of epsilon at the end of training
            epsilon_end_step: number of time steps over which epsilon is linearly decayed
            epsilon_testing: value of epsilon during testing
            training: if True the agent is training, otherwise it is testing
            device: device to be used in PyTorch, either `gpu` or `cpu`
            seed: the random seed
        """
        if seed is not None:
            torch.random.manual_seed(seed)

        # selecting the device to use
        self._device = torch.device("cuda" if torch.cuda.is_available() and device == 'gpu' else "cpu")
        print(f"Using {self._device}...")

        # creating the target network; eval() has no effect here since the network has no dropout
        self._policy_network = network.to(self._device)
        self._target_network = deepcopy(self._policy_network).to(self._device)
        self._target_network.eval()

        # saving the logger (may be None)
        self._logger = logger

        # initializing the optimizer and saving some optimization-related parameters
        self._learning_rate = learning_rate
        # self._optimizer = RMSprop(self._policy_network.parameters(), self._learning_rate)
        self._optimizer = torch.optim.Adam(self._policy_network.parameters(), lr=0.0000625, eps=0.00015)
        # self._optimizer = torch.optim.Adam(self._policy_network.parameters(), lr=0.0000125, eps=0.00015)
        self._batch_size = batch_size
        self._sync_target_step = sync_target_step
        self._update_frequency = update_frequency
        self._gradient_clipping = gradient_clipping
        # element-wise absolute error; turned into a Huber-style loss in update()
        self._loss_fn = torch.nn.L1Loss(reduction="none")
        self._reward_clipping = reward_clipping

        # setting the action space
        self._actions = actions
        self._num_steps = 0

        # setting the replay buffer
        self._replay_start_size = replay_start_size
        self._replay_size = replay_size
        self._memory = ReplayMemory(size=replay_size, seed=seed)

        # setting the MDP parameters
        self._gamma = gamma

        # setting the exploration parameters
        self._epsilon_end = epsilon_end
        self._epsilon_diff = epsilon_start - epsilon_end
        self._epsilon_end_step = epsilon_end_step
        self._epsilon_testing = epsilon_testing
        self._epsilon = epsilon_start

        # setting the training status
        self._training = training
        self._timestep = None
        self._next_timestep = None

    def select_action(
            self,
            observation: acme.types.NestedArray,
    ) -> acme.types.NestedArray:
        """Selects an action according to the epsilon-greedy policy."""
        if self._exploration_rate <= torch.rand(1).item():
            tensor_observation = torch.tensor([observation], dtype=torch.float32, device=self._device)
            # with probability 1 - epsilon the action is selected greedily from the policy network
            with torch.no_grad():
                q_values = self._policy_network(tensor_observation)
            return q_values.argmax().item()
        else:
            return torch.randint(high=self._actions, size=(1,)).item()

    def observe_first(
            self,
            timestep: dm_env.TimeStep,
    ):
        """Observes the first time step."""
        self._next_timestep = timestep

    def observe(
            self,
            action: acme.types.NestedArray,
            next_timestep: dm_env.TimeStep,
    ):
        """Observes a time step and saves a transition if the agent is training."""
        self._timestep = self._next_timestep
        self._next_timestep = next_timestep

        if self._training:
            # if the agent is training, saves the transition
            # (current time step, action, next time step), where each time step
            # carries its status, reward and observation
            transition = (self._timestep, action, self._next_timestep)
            self._memory.push(transition)

        self._num_steps += 1  # increment the number of steps the agent took

        # if a logger exists we also log the current epsilon
        if self._logger is not None:
            data = {'epsilon': self._epsilon, 'replay_size': len(self._memory)}
            self._logger.write(data, self._num_steps)

    def update(
            self,
            wait: bool = False
    ):
        """Performs a Q-learning update.

        Args:
            wait: not used since the algorithm is single process
        """
        # an update is performed only once enough samples have been collected
        # and the current step is a multiple of the update frequency
        if (self._num_steps >= self._replay_start_size) and (self._num_steps % self._update_frequency == 0):
            # samples `batch_size` transitions from memory
            transitions = self._memory.sample(self._batch_size)
        else:
            return

        device = self._device
        curr_transitions, actions, next_transitions = list(zip(*transitions))
        actions = torch.tensor(actions, device=device)
        rewards = torch.tensor([x.reward for x in next_transitions], device=device)
        curr_observations = torch.stack([torch.from_numpy(x.observation) for x in curr_transitions]).float().to(device)
        next_observations = torch.stack([torch.from_numpy(x.observation) for x in next_transitions]).float().to(device)
        done_mask = torch.tensor([x.last() for x in next_transitions], device=device, dtype=torch.bool)

        # perform reward clipping
        if self._reward_clipping:
            rewards = rewards.clamp(-1, 1)

        curr_values = self._policy_network(curr_observations)
        # the value of the current state is the value of the action that was taken
        curr_state_values = curr_values.gather(1, actions.unsqueeze(-1)).squeeze(-1)

        with torch.no_grad():
            next_values = self._target_network(next_observations)
            # the value of the next state is the maximum; max returns a (values, indices)
            # tuple, so we take the first element
            next_state_values = next_values.max(1)[0]
            # the value of a terminal state is 0
            next_state_values[done_mask] = 0.0

        # computes a Huber-style loss against the Q-learning target:
        # quadratic for errors smaller than 1, linear otherwise
        loss = self._loss_fn(curr_state_values, rewards + next_state_values * self._gamma)
        loss[loss < 1] = loss[loss < 1] ** 2
        loss = loss.mean()

        # resets the gradients computed in the optimizer and does backpropagation
        self._optimizer.zero_grad()
        loss.backward()

        # performs gradient clipping
        if self._gradient_clipping:
            for param in self._policy_network.parameters():
                param.grad.data.clamp_(-1, 1)
        # updates the parameters
        self._optimizer.step()

        # periodically updates the target network
        if (self._num_steps // self._update_frequency) % self._sync_target_step == 0:
            model_parameters = self._policy_network.state_dict()
            # noinspection PyTypeChecker
            self._target_network.load_state_dict(model_parameters)

        if self._logger is not None:
            data = {'loss': loss}
            self._logger.write(data, self._num_steps)

    @property
    def _exploration_rate(self):
        """Exploration rate (epsilon), which decays linearly during training."""
        if self._training:
            time_diff = (self._epsilon_end_step - max(0, self._num_steps - self._replay_start_size))
            epsilon = self._epsilon_end + max(0., (self._epsilon_diff * time_diff) / self._epsilon_end_step)
        else:
            epsilon = self._epsilon_testing

        self._epsilon = epsilon
        return epsilon

    def training(self):
        """Changes the agent mode to train."""
        self._training = True

    def testing(self):
        """Changes the agent mode to test."""
        self._training = False
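Since the agent follows the acme.Actor interface, a training loop simply alternates select_action, observe, and update calls. The sketch below is only illustrative: `QNetwork`, `environment`, and `num_episodes` are hypothetical placeholders, assuming a dm_env-compatible environment with a discrete action space.

# minimal usage sketch for VanillaDQN (environment and QNetwork are hypothetical)
num_actions = environment.action_spec().num_values
agent = VanillaDQN(network=QNetwork(num_actions), actions=num_actions, seed=42)

for episode in range(num_episodes):
    timestep = environment.reset()
    agent.observe_first(timestep)
    while not timestep.last():
        action = agent.select_action(timestep.observation)
        timestep = environment.step(action)
        agent.observe(action, timestep)
        agent.update()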
import copy
from collections import namedtuple

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim


class Brain:
    def __init__(self, cfg, tetris):
        self.num_actions = cfg.MODEL.SIZE_ACTION
        self.gamma = cfg.SOLVER.GAMMA
        self.BATCH_SIZE = cfg.SOLVER.BATCH_SIZE
        # the transition tuple stored in the replay memory
        self.Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))
        self.memory = ReplayMemory(cfg.SOLVER.CAPACITY, self.Transition)
        self.model = get_model(cfg)
        self.target_net = copy.deepcopy(self.model)
        self.target_net.load_state_dict(self.model.state_dict())
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
        self.tetris = tetris

    def replay(self):
        # wait until the replay memory holds at least one batch of transitions
        if len(self.memory) < self.BATCH_SIZE:
            return

        transitions = self.memory.sample(self.BATCH_SIZE)
        batch = self.Transition(*zip(*transitions))
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None])

        # Q(s, a) for the actions that were actually taken
        self.model.eval()
        state_action_values = self.model(state_batch).gather(1, action_batch)

        # terminal next states keep a value of 0; non-terminal next states are
        # evaluated with the target network
        non_final_mask = torch.tensor(
            tuple(map(lambda s: s is not None, batch.next_state)), dtype=torch.bool)
        next_state_values = torch.zeros(self.BATCH_SIZE)
        self.target_net.eval()
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()

        # Q-learning target: r + gamma * max_a' Q_target(s', a')
        expected_state_action_values = (next_state_values * self.gamma) + reward_batch

        self.model.train()
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target_model(self):
        self.target_net.load_state_dict(self.model.state_dict())

    def decide_action(self, state, mino, episode):
        # epsilon decays with the episode index
        epsilon = 0.41 * (1 / (episode + 1))

        if epsilon <= np.random.uniform(0, 1):
            self.model.eval()
            with torch.no_grad():
                action = self.tetris.get_masked_action(self.model(state), mino)
        else:
            action = torch.LongTensor(
                [[self.tetris.get_random_masked_action(mino)]])

        return action

    def brain_predict(self, state):
        self.model.eval()
        with torch.no_grad():
            action = self.model(state).max(1)[1].view(1, 1)
        return action
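For context, an episode loop around Brain might look like the sketch below. It is only a sketch: the `cfg` object, the `tetris.reset`/`tetris.step` signatures, the `memory.push` signature, and the `TARGET_UPDATE` constant are assumptions made for illustration, not part of the class above.

# hypothetical driving loop for Brain (environment API is assumed)
brain = Brain(cfg, tetris)
for episode in range(num_episodes):
    state, mino = tetris.reset()                      # assumed: returns board tensor and current mino
    done = False
    while not done:
        action = brain.decide_action(state, mino, episode)
        next_state, mino, reward, done = tetris.step(action)   # assumed step signature
        brain.memory.push(state, action,
                          None if done else next_state,
                          torch.tensor([reward], dtype=torch.float32))
        brain.replay()
        state = next_state
    if episode % TARGET_UPDATE == 0:                  # hypothetical update interval
        brain.update_target_model()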