Python DuelingDQN.forward примеры использования

Язык программирования: Python

Пространство имен/Пакет: model

Класс/Тип: DuelingDQN

Метод/Функция: forward

Примеров на hotexamples.com: 2

Python DuelingDQN.forward — найдено 2 примера. Ниже приведены лучшие примеры кода на Python для model.DuelingDQN.forward, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество подборки.

Основные методы

Показать Скрыть

DuelingDQN(30)

load_state_dict(19)

parameters(14)

act(10)

state_dict(7)

eval(6)

train(6)

forward(2)

build_model(1)

named_parameters(1)

predict(1)

update(1)

zero_grad(1)

Пример #1

Показать файл

class Agent():
    """Interacts with and learns from the environment.

    Holds two ``DuelingDQN`` networks (a local one that is optimized and a
    target one that is soft-updated from it) and a
    ``PrioritizedReplayMemory`` from which training batches are sampled.
    """

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the global RNG first and keep
        # the actual seed value on the instance.
        random.seed(seed)
        self.seed = seed

        # Q-Network
        self.qnetwork_local = DuelingDQN(state_size, action_size,
                                         seed).to(device)
        self.qnetwork_target = DuelingDQN(state_size, action_size,
                                          seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        self.priority_alpha = 0.0  #current best: 03
        self.priority_beta_start = 0.4
        self.priority_beta_frames = BUFFER_SIZE

        # Replay memory
        self.memory = PrioritizedReplayMemory(BUFFER_SIZE, self.priority_alpha,
                                              self.priority_beta_start,
                                              self.priority_beta_frames)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and, every UPDATE_EVERY calls, run a learning step.

        Params
        ======
            state: state the action was taken in
            action: action that was taken
            reward: reward received for the action
            next_state: state reached after the action
            done: whether the episode ended with this transition
        """
        # Save experience in replay memory
        self.memory.push((state, action, reward, next_state, done))

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step != 0:
            return

        # Only learn once more than one batch worth of samples is stored.
        if self.memory.storage_size() > BATCH_SIZE:
            experiences, idxes, weights = self.memory.sample(BATCH_SIZE)
            self.learn(experiences, idxes, weights, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Evaluate in eval mode without tracking gradients, then switch the
        # network back to train mode for subsequent learning steps.
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        return random.choice(np.arange(self.action_size))

    @staticmethod
    def _stack(items):
        """Stack the non-None entries of ``items`` row-wise into one array."""
        return np.vstack([item for item in items if item is not None])

    @staticmethod
    def _weighted_mse(q_expected, q_targets, weights):
        """Mean of the per-sample squared errors scaled by ``weights``.

        ``reduction="none"`` keeps one error per sample so that each weight
        multiplies its own sample's error instead of a single mean.
        ``weights`` is flattened to one value per row; callers are expected
        to pass one weight per row of ``q_expected``.
        """
        per_sample = F.mse_loss(q_expected, q_targets, reduction="none")
        sample_weights = torch.as_tensor(
            weights, dtype=per_sample.dtype,
            device=per_sample.device).reshape(-1, 1)
        return (per_sample * sample_weights).mean()

    def learn(self, experiences, idxes, weights, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            idxes: replay-memory indices of the sampled experiences, passed
                back to ``memory.update_priorities``
            weights: per-sample weights returned by ``memory.sample``,
                applied to the per-sample squared error
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = zip(*experiences)

        states = torch.from_numpy(self._stack(states)).float().to(device)
        actions = torch.from_numpy(self._stack(actions)).long().to(device)
        rewards = torch.from_numpy(self._stack(rewards)).float().to(device)
        next_states = torch.from_numpy(
            self._stack(next_states)).float().to(device)
        dones = torch.from_numpy(
            self._stack(dones).astype(np.uint8)).float().to(device)

        # The bootstrap target is treated as a constant: compute it without
        # tracking gradients so nothing accumulates in the target network's
        # .grad buffers (the optimizer only owns the local network).
        with torch.no_grad():
            next_target_q = self.qnetwork_target(next_states)
            # The local network picks the next action, the target network
            # evaluates it (Double DQN style).
            best_next_actions = self.qnetwork_local(next_states).argmax(
                dim=1, keepdim=True)
            q_targets_next = next_target_q.gather(1, best_next_actions)

        # Compute Q targets for current states
        q_targets = rewards + (gamma * q_targets_next * (1 - dones))

        # Get expected Q values from local model
        q_expected = self.qnetwork_local(states).gather(1, actions)

        # Update the priorities of the sampled transitions with the absolute
        # errors; reshape(-1) always yields a list, even for a batch of one.
        td_errors = (q_expected - q_targets).detach().abs().reshape(-1)
        self.memory.update_priorities(idxes, td_errors.cpu().numpy().tolist())

        # Compute loss
        loss = self._weighted_mse(q_expected, q_targets, weights)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

Пример #2

Показать файл

Файл: double_dueling_dqn_agent.py Проект: nlddfn/Udacity_RL_P1_D3QN

class Agent():
    """Interacts with and learns from the environment.

    Holds two ``DuelingDQN`` networks (a local one that is optimized and a
    target one that is soft-updated from it) and a ``ReplayBuffer`` from
    which training batches are sampled.
    """

    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 gamma=GAMMA,
                 buffer_size=BUFFER_SIZE,
                 batch_size=BATCH_SIZE,
                 update_every=UPDATE_EVERY,
                 lr=LR,
                 tau=TAU
    ):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            gamma (float): discount factor
            buffer_size (int): buffer size passed to the replay buffer
            batch_size (int): batch size passed to the replay buffer
            update_every (int): number of ``step`` calls between learning updates
            lr (float): learning rate of the Adam optimizer
            tau (float): interpolation parameter for the target-network soft update
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the global RNG first and keep
        # the actual seed value on the instance.
        random.seed(seed)
        self.seed = seed
        self.gamma = gamma
        self.batch_size = batch_size
        # Keep the remaining hyper-parameters on the instance so the values
        # passed by the caller are used instead of the module-level defaults.
        self.update_every = update_every
        self.lr = lr
        self.tau = tau

        # Q-Network
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model_local = DuelingDQN(state_size, action_size, seed).to(self.device)
        self.model_target = DuelingDQN(state_size, action_size, seed).to(self.device)
        self.optimizer = optim.Adam(self.model_local.parameters(), lr=lr)

        # Replay memory
        self.memory = ReplayBuffer(
            action_size=action_size,
            buffer_size=buffer_size,
            batch_size=batch_size,
            seed=seed,
            device=self.device
        )
        # Initialize time step (for updating every `update_every` steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and, every ``update_every`` calls, run a learning step.

        Params
        ======
            state: state the action was taken in
            action: action that was taken
            reward: reward received for the action
            next_state: state reached after the action
            done: whether the episode ended with this transition
        """
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every `update_every` time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step != 0:
            return

        # Only learn once more than one batch worth of samples is stored.
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.update(experiences)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.as_tensor(
            state, dtype=torch.float32).unsqueeze(0).to(self.device)

        # Evaluate in eval mode without tracking gradients, then switch the
        # network back to train mode for subsequent learning steps.
        self.model_local.eval()
        with torch.no_grad():
            qvals = self.model_local(state)
        self.model_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(qvals.cpu().numpy())
        return random.choice(np.arange(self.action_size))

    def update(self, batch):
        """Update value parameters using given batch of experience tuples.

        The discount factor is taken from ``self.gamma`` and the soft-update
        rate from ``self.tau``.

        Params
        ======
            batch (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
        """
        states, actions, rewards, next_states, dones = batch

        # Get expected Q values from local model
        curr_q = self.model_local(states).gather(1, actions)

        # Get max predicted Q values (for next states) from target model.
        # The target is a constant for the loss, so no gradients are tracked.
        with torch.no_grad():
            max_next_q = self.model_target(next_states).max(1)[0].unsqueeze(1)
        expected_q = rewards + (self.gamma * max_next_q * (1 - dones))

        loss = F.mse_loss(curr_q, expected_q)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # update target model
        self.update_target(self.model_local, self.model_target, self.tau)

    def update_target(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)