Example #1
from copy import deepcopy
from typing import Any, Optional

import acme
import acme.types
import dm_env
import torch
from torch import nn

# ReplayMemory is the project's own replay buffer implementation (not shown in this example)


class VanillaDQN(acme.Actor):
    """Vanilla Deep Q-learning as implemented in the original Nature paper
    """
    
    def __init__(
            self,
            network: nn.Module,
            actions: int,
            logger: Optional[Any] = None,
            learning_rate: float = 0.00025,
            replay_start_size: int = 50000,
            replay_size: int = 1000000,
            batch_size: int = 32,
            sync_target_step: int = 10000,
            update_frequency: int = 4,
            gradient_clipping: bool = False,
            reward_clipping: bool = True,
            gamma: float = 0.99,
            epsilon_start: float = 1.0,
            epsilon_end: float = 0.1,
            epsilon_end_step: int = 1000000,
            epsilon_testing: float = 0.05,
            training: bool = True,
            device: str = 'gpu',
            seed: Optional[int] = None
    ):
        """
        Initializes a DQN agent
        
        Args:
            network: a neural network to learn the Q-function
            actions: number of actions the agent can take
            logger: a logger that has a write method which receives scalars and a timestep
            learning_rate: the learning rate for the optimizer
            replay_start_size: minimum number of samples in memory before optimization starts; this is also
                the number of time steps taken before epsilon starts to decay
            replay_size: maximum size of the replay buffer
            batch_size: number of samples for each parameter update
            sync_target_step: number of policy updates before updating the target network parameters
            update_frequency: number of time steps between each learning step
            gradient_clipping: if True, the gradients are clipped between -1 and 1
            reward_clipping: if True, the rewards are clipped between -1 and 1
            gamma: the discount factor for the MDP
            epsilon_start: value of epsilon at start of training
            epsilon_end: value of epsilon at end of training
            epsilon_end_step: number of time steps where the epsilon is linearly decayed
            epsilon_testing: value of epsilon during testing
            training: if True the agent is in training mode, if False it is in testing mode
            device: device to be used in PyTorch, either `gpu` or `cpu`
            seed: the random seed
        """
        
        if seed is not None:
            torch.random.manual_seed(seed)
        
        # selecting the device to use
        self._device = torch.device("cuda" if torch.cuda.is_available() and device == 'gpu' else "cpu")
        print(f"Using {self._device}...")
        
        # creating the target network; eval() has no effect here since the network uses no dropout
        self._policy_network = network.to(self._device)
        self._target_network = deepcopy(self._policy_network).to(self._device)
        self._target_network.eval()
        
        # saving the logger (may be None; it is checked before every write)
        self._logger = logger
        
        # initializing the optimizer and saving some optimization related parameters
        self._learning_rate = learning_rate
        # note: the Adam hyperparameters below are hardcoded and override `learning_rate`
        # (the original Nature agent used RMSprop instead)
        self._optimizer = torch.optim.Adam(self._policy_network.parameters(), lr=0.0000625, eps=0.00015)
        self._batch_size = batch_size
        self._sync_target_step = sync_target_step
        self._update_frequency = update_frequency
        self._gradient_clipping = gradient_clipping
        self._loss_fn = torch.nn.L1Loss(reduction="none")
        self._reward_clipping = reward_clipping
        
        # setting the action space
        self._actions = actions
        self._num_steps = 0
        
        # setting the replay buffer
        self._replay_start_size = replay_start_size
        self._replay_size = replay_size
        self._memory = ReplayMemory(size=replay_size, seed=seed)
        
        # setting the MDP parameters
        self._gamma = gamma
        
        # setting the exploration parameters
        self._epsilon_end = epsilon_end
        self._epsilon_diff = epsilon_start - epsilon_end
        self._epsilon_end_step = epsilon_end_step
        self._epsilon_testing = epsilon_testing
        self._epsilon = epsilon_start
        
        # setting the training status
        self._training = training
        
        self._timestep = None
        self._next_timestep = None
    
    def select_action(
        self,
        observation: acme.types.NestedArray,
    ) -> acme.types.NestedArray:
        
        """Selects an action according to the epsilon greedy policy
        """
        
        if self._exploration_rate <= torch.rand(1).item():
            tensor_observation = torch.tensor([observation], dtype=torch.float32, device=self._device)
            
            # the action is selected with probability 1-epsilon according to the policy network
            with torch.no_grad():
                q_values = self._policy_network(tensor_observation)
                return q_values.argmax().item()
        
        else:
            return torch.randint(high=self._actions, size=(1, )).item()
    
    def observe_first(
            self,
            timestep: dm_env.TimeStep,
    ):
        """Observes the first time step
        """
        
        self._next_timestep = timestep
    
    def observe(
            self,
            action: acme.types.NestedArray,
            next_timestep: dm_env.TimeStep,
    ):
        """Observes a time step and saves a transition if the agent is training
        """
        self._timestep = self._next_timestep
        self._next_timestep = next_timestep
        
        if self._training:
            # if the agent is training, saves the transition
            # (current timestep, action taken, next timestep)
            transition = (self._timestep, action, self._next_timestep)
            self._memory.push(transition)
            self._num_steps += 1  # increment the number of steps the agent took
            
            # if a logger exists we also log the current epsilon
            if self._logger is not None:
                data = {'epsilon': self._epsilon, 'replay_size': len(self._memory)}
                self._logger.write(data, self._num_steps)
    
    def update(
            self,
            wait: bool = False
    ):
        """Performs a Q-learning update
        
        Args:
            wait: not used since the algorithm is single process
        """
        
        # if the number of steps taken is larger than the initial number of samples needed
        # and the number of steps is a multiple of the update frequency an update is performed
        if (self._num_steps >= self._replay_start_size) and (self._num_steps % self._update_frequency == 0):
            # samples `batch_size` samples from memory
            transitions = self._memory.sample(self._batch_size)
        else:
            return

        device = self._device
        
        curr_transitions, actions, next_transitions = list(zip(*transitions))
        
        actions = torch.tensor(actions, device=device)
        rewards = torch.tensor([x.reward for x in next_transitions], device=device)
        curr_observations = torch.stack([torch.from_numpy(x.observation) for x in curr_transitions]).float().to(device)
        next_observations = torch.stack([torch.from_numpy(x.observation) for x in next_transitions]).float().to(device)
        done_mask = torch.tensor([x.last() for x in next_transitions], device=device, dtype=torch.bool)
        
        # perform reward clipping
        if self._reward_clipping:
            rewards = rewards.clamp(-1, 1)
        
        curr_values = self._policy_network(curr_observations)
        
        # the value of the current state is the value of the action that was taken
        curr_state_values = curr_values.gather(1, actions.unsqueeze(-1)).squeeze(-1)
        
        with torch.no_grad():
            next_values = self._target_network(next_observations)

            # the value of the next state is the maximum, we have to take the first element since max
            # returns a tuple of value, index
            next_state_values = next_values.max(1)[0]

            # the value of a terminal state is 0
            next_state_values[done_mask] = 0.0

        # computes a Huber-style loss against the Q-learning target: the absolute error is
        # squared where it is below 1 and left linear otherwise
        loss = self._loss_fn(curr_state_values, rewards + next_state_values * self._gamma)
        
        loss[loss < 1] = loss[loss < 1] ** 2
        loss = loss.mean()
        
        # resets the gradients computed in the optimizer and does backpropagation
        self._optimizer.zero_grad()
        loss.backward()
        
        # performs gradient clipping
        if self._gradient_clipping:
            for param in self._policy_network.parameters():
                param.grad.data.clamp_(-1, 1)
        
        # updates the parameters
        self._optimizer.step()
        
        # periodically sync the target network with the policy network
        if (self._num_steps // self._update_frequency) % self._sync_target_step == 0:
            model_parameters = self._policy_network.state_dict()
            # noinspection PyTypeChecker
            self._target_network.load_state_dict(model_parameters)
        
        if self._logger is not None:
            data = {'loss': loss.item()}
            self._logger.write(data, self._num_steps)
    
    @property
    def _exploration_rate(self):
        """Exploration rate (epsilon) which decays linearly during training
        """
        if self._training:
            time_diff = (self._epsilon_end_step - max(0, self._num_steps - self._replay_start_size))
            
            epsilon = self._epsilon_end + max(0., (self._epsilon_diff * time_diff) / self._epsilon_end_step)
        else:
            epsilon = self._epsilon_testing
        
        self._epsilon = epsilon
        return epsilon
    
    def training(self):
        """Changes the agent mode to train
        """
        self._training = True
        
    def testing(self):
        """Changes the agent mode to test
        """
        self._training = False
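
For reference, below is a minimal usage sketch of how this agent could be driven by a hand-rolled environment loop. It assumes a dm_env-compatible environment `env` with a discrete action spec and a Q-network `q_net`; neither is part of the example above, and all hyperparameters are left at their defaults.

# hypothetical driver loop (assumes `env: dm_env.Environment` and `q_net: nn.Module` exist)
agent = VanillaDQN(network=q_net, actions=env.action_spec().num_values)

num_episodes = 1000  # illustration only
for episode in range(num_episodes):
    timestep = env.reset()
    agent.observe_first(timestep)
    while not timestep.last():
        action = agent.select_action(timestep.observation)  # epsilon-greedy w.r.t. the policy network
        timestep = env.step(action)
        agent.observe(action, timestep)  # stores the transition in the replay buffer
        agent.update()                   # performs a Q-learning step once enough samples exist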
Example #2
import copy
from collections import namedtuple

import numpy as np
import torch
import torch.nn.functional as F
from torch import optim

# ReplayMemory and get_model come from the surrounding project (not shown here)


class Brain:
    def __init__(self, cfg, tetris):
        self.num_actions = cfg.MODEL.SIZE_ACTION
        self.gamma = cfg.SOLVER.GAMMA
        self.BATCH_SIZE = cfg.SOLVER.BATCH_SIZE

        # namedtuple describing a single transition; kept on the instance so replay() can rebuild batches
        self.Transition = namedtuple('Transition',
                                     ('state', 'action', 'next_state', 'reward'))
        self.memory = ReplayMemory(cfg.SOLVER.CAPACITY, self.Transition)
        self.model = get_model(cfg)

        self.target_net = copy.deepcopy(self.model)
        self.target_net.load_state_dict(self.model.state_dict())

        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)

        self.tetris = tetris

    def replay(self):

        if len(self.memory) < self.BATCH_SIZE:
            return

        transitions = self.memory.sample(self.BATCH_SIZE)
        batch = self.Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None])

        self.model.eval()

        state_action_values = self.model(state_batch).gather(1, action_batch)

        # boolean mask marking transitions whose next state is non-terminal
        non_final_mask = torch.tensor(
            tuple(map(lambda s: s is not None, batch.next_state)), dtype=torch.bool)

        next_state_values = torch.zeros(self.BATCH_SIZE)

        self.target_net.eval()
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()

        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch

        self.model.train()

        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target_model(self):
        self.target_net.load_state_dict(self.model.state_dict())

    def decide_action(self, state, mino, episode):
        epsilon = 0.41 * (1 / (episode + 1))

        if epsilon <= np.random.uniform(0, 1):
            self.model.eval()
            with torch.no_grad():
                action = self.tetris.get_masked_action(self.model(state), mino)
        else:
            action = torch.LongTensor(
                [[self.tetris.get_random_masked_action(mino)]])

        return action

    def brain_predict(self, state):
        self.model.eval()
        with torch.no_grad():
            action = self.model(state).max(1)[1].view(1, 1)
        return action
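
As a rough illustration, the Brain above would typically be wrapped in an outer episode loop along the following lines. The Tetris environment API (reset/step), the ReplayMemory.push signature, and cfg.SOLVER.NUM_EPISODES are assumptions made for this sketch, not part of the original example.

# hypothetical training loop; the environment and replay-buffer APIs are assumed
brain = Brain(cfg, tetris)

for episode in range(cfg.SOLVER.NUM_EPISODES):       # NUM_EPISODES is assumed to exist in cfg
    state, mino = tetris.reset()                     # assumed reset() signature
    done = False
    while not done:
        action = brain.decide_action(state, mino, episode)
        next_state, reward, done, mino = tetris.step(action)   # assumed step() signature
        # store the transition; push() arguments depend on the project's ReplayMemory
        brain.memory.push(state, action, None if done else next_state, reward)
        brain.replay()          # one optimization step on a sampled mini-batch
        state = next_state
    if episode % 10 == 0:
        brain.update_target_model()   # periodically sync the target network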