from copy import deepcopy
from typing import Optional

import acme
import acme.types
import dm_env
import torch
import torch.nn as nn


class VanillaDQN(acme.Actor):
    """Vanilla Deep Q-learning as implemented in the original Nature paper."""

    def __init__(
            self,
            network: nn.Module,
            actions: int,
            logger: Optional = None,
            learning_rate: float = 0.00025,
            replay_start_size: int = 50000,
            replay_size: int = 1000000,
            batch_size: int = 32,
            sync_target_step: int = 10000,
            update_frequency: int = 4,
            gradient_clipping: bool = False,
            reward_clipping: bool = True,
            gamma: float = 0.99,
            epsilon_start: float = 1.0,
            epsilon_end: float = 0.1,
            epsilon_end_step: int = 1000000,
            epsilon_testing: float = 0.05,
            training: bool = True,
            device: str = 'gpu',
            seed: Optional[int] = None
    ):
        """Initializes a DQN agent.

        Args:
            network: a neural network to learn the Q-function
            actions: number of actions the agent can take
            logger: a logger with a `write` method that receives scalars and a time step
            learning_rate: the learning rate for the optimizer
            replay_start_size: minimum number of samples in memory before optimization
                starts; also the number of time steps taken before epsilon starts decaying
            replay_size: maximum size of the replay buffer
            batch_size: number of samples for each parameter update
            sync_target_step: number of policy updates between target network updates
            update_frequency: number of time steps between learning steps
            gradient_clipping: if True, the gradients are clipped between -1 and 1
            reward_clipping: if True, the rewards are clipped between -1 and 1
            gamma: the discount factor of the MDP
            epsilon_start: value of epsilon at the start of training
            epsilon_end: value of epsilon at the end of training
            epsilon_end_step: number of time steps over which epsilon is linearly decayed
            epsilon_testing: value of epsilon during testing
            training: if True the agent is training, otherwise it is testing
            device: device to be used in PyTorch, either `gpu` or `cpu`
            seed: the random seed
        """
        if seed is not None:
            torch.random.manual_seed(seed)

        # selecting the device to use
        self._device = torch.device("cuda" if torch.cuda.is_available() and device == 'gpu' else "cpu")
        print(f"Using {self._device}...")

        # creating the target network; eval() has no effect here since the network has no dropout
        self._policy_network = network.to(self._device)
        self._target_network = deepcopy(self._policy_network).to(self._device)
        self._target_network.eval()

        # saving the logger (may be None)
        self._logger = logger

        # initializing the optimizer and saving some optimization-related parameters
        self._learning_rate = learning_rate
        # self._optimizer = RMSprop(self._policy_network.parameters(), self._learning_rate)
        self._optimizer = torch.optim.Adam(self._policy_network.parameters(), lr=0.0000625, eps=0.00015)
        # self._optimizer = torch.optim.Adam(self._policy_network.parameters(), lr=0.0000125, eps=0.00015)
        self._batch_size = batch_size
        self._sync_target_step = sync_target_step
        self._update_frequency = update_frequency
        self._gradient_clipping = gradient_clipping
        # element-wise absolute error; turned into a Huber-style loss in update()
        self._loss_fn = torch.nn.L1Loss(reduction="none")
        self._reward_clipping = reward_clipping

        # setting the action space
        self._actions = actions
        self._num_steps = 0

        # setting the replay buffer
        self._replay_start_size = replay_start_size
        self._replay_size = replay_size
        self._memory = ReplayMemory(size=replay_size, seed=seed)

        # setting the MDP parameters
        self._gamma = gamma

        # setting the exploration parameters
        self._epsilon_end = epsilon_end
        self._epsilon_diff = epsilon_start - epsilon_end
        self._epsilon_end_step = epsilon_end_step
        self._epsilon_testing = epsilon_testing
        self._epsilon = epsilon_start

        # setting the training status
        self._training = training
        self._timestep = None
        self._next_timestep = None

    def select_action(
            self,
            observation: acme.types.NestedArray,
    ) -> acme.types.NestedArray:
        """Selects an action according to the epsilon-greedy policy."""
        if self._exploration_rate <= torch.rand(1).item():
            tensor_observation = torch.tensor([observation], dtype=torch.float32, device=self._device)
            # with probability 1 - epsilon the action is selected greedily from the policy network
            with torch.no_grad():
                q_values = self._policy_network(tensor_observation)
            return q_values.argmax().item()
        else:
            return torch.randint(high=self._actions, size=(1,)).item()

    def observe_first(
            self,
            timestep: dm_env.TimeStep,
    ):
        """Observes the first time step."""
        self._next_timestep = timestep

    def observe(
            self,
            action: acme.types.NestedArray,
            next_timestep: dm_env.TimeStep,
    ):
        """Observes a time step and saves a transition if the agent is training."""
        self._timestep = self._next_timestep
        self._next_timestep = next_timestep

        if self._training:
            # if the agent is training, saves the transition
            # (current time step, action, next time step), where each time step
            # carries its status, reward and observation
            transition = (self._timestep, action, self._next_timestep)
            self._memory.push(transition)

        self._num_steps += 1  # increment the number of steps the agent took

        # if a logger exists we also log the current epsilon
        if self._logger is not None:
            data = {'epsilon': self._epsilon, 'replay_size': len(self._memory)}
            self._logger.write(data, self._num_steps)

    def update(
            self,
            wait: bool = False
    ):
        """Performs a Q-learning update.

        Args:
            wait: not used since the algorithm is single process
        """
        # an update is performed only once enough samples have been collected
        # and the current step is a multiple of the update frequency
        if (self._num_steps >= self._replay_start_size) and (self._num_steps % self._update_frequency == 0):
            # samples `batch_size` transitions from memory
            transitions = self._memory.sample(self._batch_size)
        else:
            return

        device = self._device
        curr_transitions, actions, next_transitions = list(zip(*transitions))
        actions = torch.tensor(actions, device=device)
        rewards = torch.tensor([x.reward for x in next_transitions], device=device)
        curr_observations = torch.stack([torch.from_numpy(x.observation) for x in curr_transitions]).float().to(device)
        next_observations = torch.stack([torch.from_numpy(x.observation) for x in next_transitions]).float().to(device)
        done_mask = torch.tensor([x.last() for x in next_transitions], device=device, dtype=torch.bool)

        # perform reward clipping
        if self._reward_clipping:
            rewards = rewards.clamp(-1, 1)

        curr_values = self._policy_network(curr_observations)
        # the value of the current state is the value of the action that was taken
        curr_state_values = curr_values.gather(1, actions.unsqueeze(-1)).squeeze(-1)

        with torch.no_grad():
            next_values = self._target_network(next_observations)
            # the value of the next state is the maximum; max returns a (values, indices)
            # tuple, so we take the first element
            next_state_values = next_values.max(1)[0]
            # the value of a terminal state is 0
            next_state_values[done_mask] = 0.0

        # computes a Huber-style loss against the Q-learning target:
        # quadratic for errors smaller than 1, linear otherwise
        loss = self._loss_fn(curr_state_values, rewards + next_state_values * self._gamma)
        loss[loss < 1] = loss[loss < 1] ** 2
        loss = loss.mean()

        # resets the gradients computed in the optimizer and does backpropagation
        self._optimizer.zero_grad()
        loss.backward()

        # performs gradient clipping
        if self._gradient_clipping:
            for param in self._policy_network.parameters():
                param.grad.data.clamp_(-1, 1)
        # updates the parameters
        self._optimizer.step()

        # periodically updates the target network
        if (self._num_steps // self._update_frequency) % self._sync_target_step == 0:
            model_parameters = self._policy_network.state_dict()
            # noinspection PyTypeChecker
            self._target_network.load_state_dict(model_parameters)

        if self._logger is not None:
            data = {'loss': loss}
            self._logger.write(data, self._num_steps)

    @property
    def _exploration_rate(self):
        """Exploration rate (epsilon), which decays linearly during training."""
        if self._training:
            time_diff = (self._epsilon_end_step - max(0, self._num_steps - self._replay_start_size))
            epsilon = self._epsilon_end + max(0., (self._epsilon_diff * time_diff) / self._epsilon_end_step)
        else:
            epsilon = self._epsilon_testing

        self._epsilon = epsilon
        return epsilon

    def training(self):
        """Changes the agent mode to train."""
        self._training = True

    def testing(self):
        """Changes the agent mode to test."""
        self._training = False
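Since the agent follows the acme.Actor interface, a training loop simply alternates select_action, observe, and update calls. The sketch below is only illustrative: `QNetwork`, `environment`, and `num_episodes` are hypothetical placeholders, assuming a dm_env-compatible environment with a discrete action space.

# minimal usage sketch for VanillaDQN (environment and QNetwork are hypothetical)
num_actions = environment.action_spec().num_values
agent = VanillaDQN(network=QNetwork(num_actions), actions=num_actions, seed=42)

for episode in range(num_episodes):
    timestep = environment.reset()
    agent.observe_first(timestep)
    while not timestep.last():
        action = agent.select_action(timestep.observation)
        timestep = environment.step(action)
        agent.observe(action, timestep)
        agent.update()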
import copy
from collections import namedtuple

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim


class Brain:
    def __init__(self, cfg, tetris):
        self.num_actions = cfg.MODEL.SIZE_ACTION
        self.gamma = cfg.SOLVER.GAMMA
        self.BATCH_SIZE = cfg.SOLVER.BATCH_SIZE
        # the transition tuple stored in the replay memory
        self.Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))
        self.memory = ReplayMemory(cfg.SOLVER.CAPACITY, self.Transition)
        self.model = get_model(cfg)
        self.target_net = copy.deepcopy(self.model)
        self.target_net.load_state_dict(self.model.state_dict())
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
        self.tetris = tetris

    def replay(self):
        # wait until the replay memory holds at least one batch of transitions
        if len(self.memory) < self.BATCH_SIZE:
            return

        transitions = self.memory.sample(self.BATCH_SIZE)
        batch = self.Transition(*zip(*transitions))
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None])

        # Q(s, a) for the actions that were actually taken
        self.model.eval()
        state_action_values = self.model(state_batch).gather(1, action_batch)

        # terminal next states keep a value of 0; non-terminal next states are
        # evaluated with the target network
        non_final_mask = torch.tensor(
            tuple(map(lambda s: s is not None, batch.next_state)), dtype=torch.bool)
        next_state_values = torch.zeros(self.BATCH_SIZE)
        self.target_net.eval()
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()

        # Q-learning target: r + gamma * max_a' Q_target(s', a')
        expected_state_action_values = (next_state_values * self.gamma) + reward_batch

        self.model.train()
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target_model(self):
        self.target_net.load_state_dict(self.model.state_dict())

    def decide_action(self, state, mino, episode):
        # epsilon decays with the episode index
        epsilon = 0.41 * (1 / (episode + 1))

        if epsilon <= np.random.uniform(0, 1):
            self.model.eval()
            with torch.no_grad():
                action = self.tetris.get_masked_action(self.model(state), mino)
        else:
            action = torch.LongTensor(
                [[self.tetris.get_random_masked_action(mino)]])

        return action

    def brain_predict(self, state):
        self.model.eval()
        with torch.no_grad():
            action = self.model(state).max(1)[1].view(1, 1)
        return action
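For context, an episode loop around Brain might look like the sketch below. It is only a sketch: the `cfg` object, the `tetris.reset`/`tetris.step` signatures, the `memory.push` signature, and the `TARGET_UPDATE` constant are assumptions made for illustration, not part of the class above.

# hypothetical driving loop for Brain (environment API is assumed)
brain = Brain(cfg, tetris)
for episode in range(num_episodes):
    state, mino = tetris.reset()                      # assumed: returns board tensor and current mino
    done = False
    while not done:
        action = brain.decide_action(state, mino, episode)
        next_state, mino, reward, done = tetris.step(action)   # assumed step signature
        brain.memory.push(state, action,
                          None if done else next_state,
                          torch.tensor([reward], dtype=torch.float32))
        brain.replay()
        state = next_state
    if episode % TARGET_UPDATE == 0:                  # hypothetical update interval
        brain.update_target_model()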