if step > UPDATE_START and step % UPDATE_INTERVAL == 0:
    # Randomly sample a batch of transitions B = {(s, a, r, s', d)} from D
    batch = random.sample(D, BATCH_SIZE)
    batch = {k: torch.cat([d[k] for d in batch], dim=0) for k in batch[0].keys()}

    # Compute targets
    y = batch['reward'] + DISCOUNT * (1 - batch['done']) * target_agent(
        batch['next_state']).max(dim=1)[0]

    # Update Q-function by one step of gradient descent
    value_loss = (agent(batch['state']).gather(1, batch['action']).squeeze(dim=1) - y).pow(2).mean()
    optimiser.zero_grad()
    value_loss.backward()
    optimiser.step()

if step > UPDATE_START and step % TARGET_UPDATE_INTERVAL == 0:
    # Update target network
    target_agent = create_target_network(agent)

if step > UPDATE_START and step % TEST_INTERVAL == 0:
    agent.eval()
    total_reward = test(agent)
    pbar.set_description('Step: %i | Reward: %f' % (step, total_reward))
    plot(step, total_reward, 'dqn')
    agent.train()
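# The loop above calls create_target_network(agent), which is not defined in
# this snippet. A minimal sketch under the assumption that it simply
# deep-copies the online network and freezes the copy so it can serve as the
# target network:
import copy

def create_target_network(network):
    target = copy.deepcopy(network)
    for param in target.parameters():
        param.requires_grad = False  # the target network is never trained directly
    return target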
class QAgent:

    def __init__(self, epsilon_start, epsilon_end, epsilon_anneal, nb_actions,
                 learning_rate, gamma, batch_size, replay_memory_size,
                 hidden_size, model_input_size, use_PER, use_ICM):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_anneal_over_steps = epsilon_anneal
        self.num_actions = nb_actions
        self.gamma = gamma
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.step_no = 0

        self.policy = DQN(hidden_size=hidden_size, inputs=model_input_size,
                          outputs=nb_actions).to(self.device)
        self.target = DQN(hidden_size=hidden_size, inputs=model_input_size,
                          outputs=nb_actions).to(self.device)
        self.target.load_state_dict(self.policy.state_dict())
        self.target.eval()
        self.hidden_size = hidden_size

        self.optimizer = torch.optim.AdamW(self.policy.parameters(), lr=self.learning_rate)

        self.use_PER = use_PER
        if use_PER:
            self.replay = Prioritized_Replay_Memory(replay_memory_size)
        else:
            self.replay = Replay_Memory(replay_memory_size)

        self.loss_function = torch.nn.MSELoss()

        self.use_ICM = use_ICM
        if use_ICM:
            self.icm = ICM(model_input_size, nb_actions)

    # Get the current epsilon value according to the start/end and annealing values
    def get_epsilon(self):
        eps = self.epsilon_end
        if self.step_no < self.epsilon_anneal_over_steps:
            eps = self.epsilon_start - self.step_no * \
                ((self.epsilon_start - self.epsilon_end) / self.epsilon_anneal_over_steps)
        return eps

    # Select an action with epsilon-greedy exploration
    def select_action(self, state):
        self.step_no += 1
        if np.random.uniform() > self.get_epsilon():
            with torch.no_grad():
                return torch.argmax(self.policy(state)).view(1)
        else:
            return torch.tensor([random.randrange(self.num_actions)],
                                device=self.device, dtype=torch.long)

    # Update the model according to one-step TD targets
    def update_model(self):
        if self.use_PER:
            batch_index, batch, ImportanceSamplingWeights = self.replay.sample(self.batch_size)
        else:
            batch = self.replay.sample(self.batch_size)

        batch_tuple = Transition(*zip(*batch))
        state = torch.stack(batch_tuple.state)
        action = torch.stack(batch_tuple.action)
        reward = torch.stack(batch_tuple.reward)
        next_state = torch.stack(batch_tuple.next_state)
        done = torch.stack(batch_tuple.done)

        self.optimizer.zero_grad()
        if self.use_ICM:
            self.icm.optimizer.zero_grad()
            forward_loss = self.icm.get_forward_loss(state, action, next_state)
            inverse_loss = self.icm.get_inverse_loss(state, action, next_state)
            icm_loss = (1 - self.icm.beta) * inverse_loss.mean() + \
                self.icm.beta * forward_loss.mean()

        td_estimates = self.policy(state).gather(1, action).squeeze()
        td_targets = reward + (1 - done.float()) * self.gamma * \
            self.target(next_state).max(1)[0].detach()

        if self.use_PER:
            # Importance-sampling-weighted squared TD error
            errors = td_estimates - td_targets
            weights = torch.tensor(ImportanceSamplingWeights, device=self.device)
            loss = (weights * errors.pow(2)).mean()
            # Refresh priorities of the sampled transitions
            self.replay.batch_update(batch_index, errors.detach().cpu().numpy())
        else:
            loss = self.loss_function(td_estimates, td_targets)

        if self.use_ICM:
            loss = self.icm.lambda_weight * loss + icm_loss

        loss.backward()
        for param in self.policy.parameters():
            param.grad.data.clamp_(-1, 1)
        if self.use_ICM:
            self.icm.optimizer.step()
        self.optimizer.step()
        return loss.item()

    # Set target net parameters to policy net parameters
    def update_target(self):
        self.target.load_state_dict(self.policy.state_dict())

    # Save the policy network weights
    def save(self, path, name):
        dirname = os.path.dirname(__file__)
        filename = os.path.join(dirname, os.path.join(path, name + ".pt"))
        torch.save(self.policy.state_dict(), filename)

    # Load a model
    def load(self, path):
        dirname = os.path.dirname(__file__)
        filename = os.path.join(dirname, path)
        self.policy.load_state_dict(torch.load(filename))

    # Store an experience tuple in replay memory
    def cache(self, state, action, reward, next_state, done):
        self.replay.push(state, action, reward, next_state, done)
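# QAgent assumes a `Transition` namedtuple and a `Replay_Memory` with
# push/sample methods, neither of which is defined in this snippet. A minimal
# sketch of what such helpers could look like; the exact names and signatures
# are assumptions, not the original implementation:
import random
from collections import namedtuple, deque

Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state', 'done'))

class Replay_Memory:
    def __init__(self, capacity):
        # drop the oldest transitions once the buffer is full
        self.memory = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.memory.append(Transition(state, action, reward, next_state, done))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)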
class DQNAgent():
    """Interacts with and learns from the environment."""

    def __init__(self, name, state_size, action_size, use_double_dqn=False,
                 use_dueling=False, seed=0, lr_decay=0.9999,
                 use_prioritized_replay=False):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.name = name
        self.state_size = state_size
        self.action_size = action_size
        self.use_double_dqn = use_double_dqn
        self.use_dueling = use_dueling
        self.seed = random.seed(seed)
        self.use_prioritized_replay = use_prioritized_replay

        # Q-Network
        if use_dueling:
            self.qnetwork_local = DuelingDQN(state_size, action_size, seed).to(device)
            self.qnetwork_target = DuelingDQN(state_size, action_size, seed).to(device)
        else:
            self.qnetwork_local = DQN(state_size, action_size, seed).to(device)
            self.qnetwork_target = DQN(state_size, action_size, seed).to(device)
        self.qnetwork_target.eval()

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.lr_scheduler = optim.lr_scheduler.ExponentialLR(self.optimizer, lr_decay)

        # Replay memory
        if self.use_prioritized_replay:
            self.memory = PrioritizedReplayBuffer(BUFFER_SIZE, seed, alpha=0.2,
                                                  beta=0.8, beta_scheduler=1.0)
        else:
            self.memory = ReplayBuffer(BUFFER_SIZE, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample(BATCH_SIZE)
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Epsilon-greedy action selection
        if random.random() > eps:
            self.qnetwork_local.eval()
            with torch.no_grad():
                action_values = self.qnetwork_local(state)
            self.qnetwork_local.train()
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        if self.use_prioritized_replay:
            states, actions, rewards, next_states, dones, indices, weights = experiences
        else:
            states, actions, rewards, next_states, dones = experiences

        with torch.no_grad():
            # Get max predicted Q values (for next states) from target model
            if self.use_double_dqn:
                # Double DQN: select the greedy action with the local network,
                # evaluate it with the target network
                best_local_actions = self.qnetwork_local(states).max(1)[1].unsqueeze(1)
                Q_targets_next = self.qnetwork_target(next_states).gather(1, best_local_actions)
            else:
                Q_targets_next = self.qnetwork_target(next_states).max(1)[0].unsqueeze(1)
            # Compute Q targets for current states
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        if self.use_prioritized_replay:
            # Importance-sampling-weighted squared TD error
            td_sq_error = (Q_targets - Q_expected).squeeze().pow(2)
            with torch.no_grad():
                # Refresh the priorities of the sampled transitions
                new_priorities = td_sq_error.detach() * weights
                self.memory.update_priorities(indices, new_priorities)
            loss = (td_sq_error * weights).mean()
        else:
            # Compute loss
            loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.lr_scheduler.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
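# A minimal sketch of a loop that could drive the agent above, assuming the
# classic Gym API and that the module-level names the class relies on
# (device, LR, BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, UPDATE_EVERY, the network
# classes and replay buffers) are defined. The environment name, episode
# count, and epsilon schedule are illustrative assumptions.
import gym

env = gym.make('CartPole-v1')
agent = DQNAgent('dqn', state_size=env.observation_space.shape[0],
                 action_size=env.action_space.n, use_double_dqn=True)

eps = 1.0
for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)  # store transition and learn
        state = next_state
    eps = max(0.01, eps * 0.995)  # anneal exploration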
import torch.optim as optim
import copy
import pickle

from utils import *
from models import DQN

initial_Q = AER_initial_Q()
# initial_Q = torch.zeros(n_actions, device=device)

policy_net = DQN(recent_k, n_agents, n_actions, initial_Q).to(device)
target_net = DQN(recent_k, n_agents, n_actions, initial_Q).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters())

# Q-table initialised to initial_Q for every pair of previous actions
Q = torch.zeros(n_actions, n_actions, n_actions, device=device)
for i in range(n_actions):
    for j in range(n_actions):
        Q[i, j, :] = initial_Q.view(-1)

memory = ReplayMemory(MEM_SIZE)

heat = torch.zeros(n_agents, n_actions, n_actions, device=device)
heat_unique0 = []
heat_freq0 = []
heat_unique1 = []
heat_freq1 = []
class FixedDQNAgent(DQNAgent):
    """
    DQN Agent with a target network to compute Q-targets. Extends DQNAgent.
    """

    def __init__(self, input_dim, output_dim, lr, gamma, max_memory_size,
                 batch_size, eps_start, eps_end, eps_decay, device,
                 target_update=100, linear1_units=64, linear2_units=64,
                 decay_type="linear"):
        super().__init__(input_dim, output_dim, lr, gamma, max_memory_size,
                         batch_size, eps_start, eps_end, eps_decay, device,
                         linear1_units, linear2_units, decay_type)
        self.model_name = "FixedDQN"
        self.target_update_freq = target_update

        # networks
        self.output_dim = output_dim
        self.target_net = DQN(input_dim, output_dim, linear1_units,
                              linear2_units).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.updated = 0

    def learn(self):
        """
        Update the weights of the network, using target_net to compute Q-targets.
        Every self.target_update_freq updates, clone the policy_net.

        :return: the loss
        """
        states, next_states, actions, rewards, dones = self.memory.sample(self.batch_size)

        curr_q_vals = self.policy_net(states).gather(1, actions)
        next_q_vals = self.target_net(next_states).max(1, keepdim=True)[0].detach()
        target = (rewards + self.gamma * next_q_vals * (1 - dones)).to(self.device)

        loss = F.smooth_l1_loss(curr_q_vals, target)
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

        self.updated += 1
        if self.updated % self.target_update_freq == 0:
            self.target_hard_update()

        return loss.item()

    def target_hard_update(self):
        """ Clone the policy net weights into the target net """
        self.target_net.load_state_dict(self.policy_net.state_dict())
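# FixedDQNAgent copies the policy net into the target net all at once every
# `target_update` learning steps. An alternative, not part of the class above,
# is a Polyak (soft) update that blends the two networks a little on every
# step, in the same spirit as the soft_update method of the earlier DQNAgent:
def soft_target_update(policy_net, target_net, tau=0.005):
    # θ_target ← τ*θ_policy + (1 - τ)*θ_target
    for t_param, p_param in zip(target_net.parameters(), policy_net.parameters()):
        t_param.data.copy_(tau * p_param.data + (1.0 - tau) * t_param.data)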
class DQNAgent(BaseAgent):
    """ Agent with a DQN network. """

    def __init__(self, input_dim, output_dim, lr, gamma, max_memory_size,
                 batch_size, eps_start, eps_end, eps_decay, device,
                 linear1_units=64, linear2_units=64, decay_type="linear"):
        super().__init__(max_memory_size, batch_size, eps_start, eps_end,
                         eps_decay, device, decay_type)
        self.model_name = "DQN"
        self.output_dim = output_dim
        self.policy_net = DQN(input_dim, output_dim, linear1_units,
                              linear2_units).to(device)

        # optimizer
        self.optim = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.gamma = gamma

    def choose_action(self, state, testing=False):
        """
        Choose an action to perform, using an eps-greedy approach.

        :param state: current state of the environment
        :param testing: if True, always choose the greedy action
        :return: the action chosen
        """
        self.curr_step += 1
        if not testing and np.random.random() < self.curr_eps:
            return np.random.randint(0, self.output_dim)
        else:
            # the network is used for inference only here, so gradients are not tracked
            with torch.no_grad():
                return self.policy_net(state).argmax().item()

    def learn(self):
        """
        Update the weights of the network.

        :return: the loss
        """
        states, next_states, actions, rewards, dones = self.memory.sample(self.batch_size)

        curr_q_vals = self.policy_net(states).gather(1, actions)
        # Without a target network, the bootstrap target is computed from the
        # same policy_net that is being updated
        next_q_vals = self.policy_net(next_states).max(1, keepdim=True)[0].detach()
        target = (rewards + self.gamma * next_q_vals * (1 - dones)).to(self.device)

        loss = F.smooth_l1_loss(curr_q_vals, target)
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

        return loss.item()

    def set_test(self):
        """ Sets the network in evaluation mode """
        self.policy_net.eval()

    def set_train(self):
        """ Sets the network in training mode """
        self.policy_net.train()

    def save(self, filename):
        """
        Save the network weights.

        :param filename: path
        """
        self.policy_net.save(filename)

    def load(self, filename):
        """
        Load the network weights.

        :param filename: path of the weight file
        """
        self.policy_net.load(filename, self.device)
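# Both agents above rely on BaseAgent (not shown) to maintain `curr_eps` and
# `curr_step` according to `decay_type`. A minimal sketch of what such an
# epsilon schedule could look like; the function name and the exponential
# formula are assumptions, not the original BaseAgent implementation:
import math

def epsilon_at(step, eps_start, eps_end, eps_decay, decay_type="linear"):
    if decay_type == "linear":
        # decrease linearly from eps_start to eps_end over eps_decay steps
        frac = min(1.0, step / eps_decay)
        return eps_start + frac * (eps_end - eps_start)
    else:
        # decay exponentially towards eps_end with rate 1/eps_decay
        return eps_end + (eps_start - eps_end) * math.exp(-step / eps_decay)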