Python QNetwork.criterion примеры использования

Язык программирования: Python

Пространство имен/Пакет: model

Класс/Тип: QNetwork

Метод/Функция: criterion

Примеров на hotexamples.com: 2

Python QNetwork.criterion - 2 примера найдено. Это лучшие примеры Python кода для model.QNetwork.criterion, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

QNetwork(30)

eval(30)

train(30)

state_dict(30)

load_state_dict(30)

parameters(30)

forward(23)

to(8)

set_weights(6)

cuda(5)

get_weights(4)

sample_action(3)

cpu(3)

save_weights(2)

load_weights(2)

decide_action(2)

load_model(2)

items(2)

criterion(2)

trainNet(1)

backward(1)

step(1)

update_mean(1)

spectrum(1)

update_nn(1)

update_target_network(1)

soft_update(1)

restore(1)

set_params(1)

save(1)

sample_actions(1)

qvalue(1)

predict_act(1)

__str__(1)

named_parameters(1)

loss_fn(1)

load(1)

initialize_weights(1)

get_action(1)

foward(1)

update_weights(1)

Пример #1

Показать файл

Файл: agents.py Проект: sachinruk/LunarLander

class DQNAgent():
    """Interacts with and learns from the environment.

    Holds a local and a target ``QNetwork``, an Adam optimizer over the
    local network, and a ``ReplayBuffer``. Supports plain DQN and, when
    ``ddqn`` is true, Double DQN targets.
    """

    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 ddqn=False,
                 dueling=False,
                 priority=False):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            ddqn (bool): use Double DQN targets in ``learn``/``update_error``
            dueling (bool): accepted for interface compatibility; not used
                by this class
            priority (bool): forwarded unchanged to ``ReplayBuffer``
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the RNG first and store the
        # seed value itself rather than the call's return value.
        random.seed(seed)
        self.seed = seed
        self.ddqn = ddqn

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed,
                                   priority)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and, every UPDATE_EVERY calls, learn from memory.

        Params
        ======
            state, action, reward, next_state, done: a single transition,
                passed straight to ``self.memory.add``
        """
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # NOTE(review): presumably get_n=UPDATE_EVERY selects the
            # transitions added since the last update so their stored errors
            # get refreshed -- confirm against ReplayBuffer.sample.
            experiences = self.memory.sample(get_n=UPDATE_EVERY)
            self.update_error(experiences, GAMMA)
            # If enough samples are available in memory, sample a batch,
            # learn from it, then refresh that batch's stored errors using
            # the updated networks.
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)
                self.update_error(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Evaluate in eval mode without gradients, then restore train mode.
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def _td_target(self, rewards, next_states, dones, gamma):
        """Return the bootstrapped TD target, computed without gradients.

        With ``self.ddqn`` the next action is chosen by the local network
        and evaluated by the target network (Double DQN); otherwise the
        target network's maximum value over the last dimension is used.
        """
        with torch.no_grad():
            if self.ddqn:
                best_actions = self.qnetwork_local(next_states).argmax(
                    -1, keepdim=True)
                next_q = self.qnetwork_target(next_states).gather(
                    -1, best_actions)
            else:  # Normal DQN
                next_q = self.qnetwork_target(next_states).max(
                    -1, keepdim=True)[0]
            # (1 - dones) removes the bootstrap term where done is 1.
            return rewards + gamma * next_q * (1 - dones)

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (tuple): (states, actions, rewards, next_states,
                dones, idx) as produced by ``self.memory.sample``; the last
                element is ignored here
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, _ = experiences

        target = self._td_target(rewards, next_states, dones, gamma)
        # Predicted value of the action actually taken; keeps gradients.
        old_val = self.qnetwork_local(states).gather(-1, actions)

        self.optimizer.zero_grad()
        loss = self.qnetwork_local.criterion(old_val, target)
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def update_error(self, experiences, gamma):
        """Recompute absolute TD errors and store them in the replay memory.

        Params
        ======
            experiences (tuple): (states, actions, rewards, next_states,
                dones, idx); ``idx`` holds the keys/indices into
                ``self.memory.error_buffer`` for each sample
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, idx = experiences

        # No parameter update happens here, so skip building a graph.
        with torch.no_grad():
            old_val = self.qnetwork_local(states).gather(-1, actions)
        target = self._td_target(rewards, next_states, dones, gamma)

        # .cpu() makes the NumPy conversion valid for tensors on any device;
        # reshape(-1) (instead of squeeze) keeps a 1-D array even for a
        # single-sample batch so it can always be zipped with idx.
        error = torch.abs(old_val - target).cpu().numpy().reshape(-1)
        for i, err in zip(idx, error):
            self.memory.error_buffer[i] = err

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

Пример #2

Показать файл

Файл: dqn_agent.py Проект: ascfraguas/Deep_RL_agents

class Agent():
    """DQN agent with a local and a target Q-network and a replay buffer."""

    def __init__(self, state_size, action_size, seed):
        """Initialize the agent.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): number of available actions
            seed (int): random seed, also forwarded to the networks and buffer
        """

        # Initialize state / action space sizes, and the counter for updating
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the RNG first and store the
        # seed value itself rather than the call's return value.
        random.seed(seed)
        self.seed = seed
        self.t_step = 0

        # Initialize the two Q-networks (the local and target) and define the optimizer
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Initialize agent's replay buffer
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    def step(self, state, action, reward, next_state, done):
        """Store one transition and learn if it is time to do so."""

        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps, once the buffer holds more
        # than one batch worth of samples
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Return an epsilon-greedy action for the given state.

        Params
        ======
            state (np.ndarray): current state
            eps (float): probability threshold for picking a random action
        """

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Set the Q-network to evaluation mode (turn off training layers such as dropouts) and do a forward pass for the state
        # without computing gradients. Afterwards, return to training mode
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def soft_update(self, local_model, target_model, tau):
        """Soft update for the target Q-network.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights are read from here
            target_model (PyTorch model): weights are updated in place
            tau (float): interpolation parameter
        """

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def learn(self, experiences, gamma):
        """Update local network weights from a sampled batch.

        Params
        ======
            experiences (tuple): (states, actions, rewards, next_states, dones)
            gamma (float): discount factor
        """

        # Unpack experiences
        states, actions, rewards, next_states, dones = experiences

        # Get predictions from local network, i.e. Q-values of the (state, action) pairs in the sampled batch of experiences
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Get training targets, i.e. current reward + max predicted Q-value for next state by target network, for each (state, action).
        # Computed under no_grad so no graph is built for the target network.
        with torch.no_grad():
            Q_targets_next = self.qnetwork_target(next_states).max(
                1, keepdim=True)[0]
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Now compute the loss wrt these new targets
        loss = self.qnetwork_local.criterion(Q_expected, Q_targets)
        # Clear gradients left over from the previous optimisation step
        # before back-propagating the new loss.
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)