Python QNetwork.eval примеры использования

Язык программирования: Python

Пространство имен/Пакет: network

Класс/Тип: QNetwork

Метод/Функция: eval

Примеров на hotexamples.com: 6

Python QNetwork.eval - 6 примеров найдено. Это лучшие примеры Python кода для network.QNetwork.eval, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

QNetwork(11)

eval(6)

parameters(5)

train(4)

load_state_dict(3)

get_weights(2)

set_weights(2)

state_dict(2)

bias_variable(1)

build(1)

conv2d(1)

max_pool_2x2(1)

summary(1)

variable_summaries(1)

weight_variable(1)

Пример #1

Показать файл

Файл: test.py Проект: naru380/dqn-tetris

class TrainedBrain():
    """Inference-only wrapper around a trained ``QNetwork``.

    Loads saved weights into a policy network, switches it to evaluation
    mode, and picks the action with the highest predicted Q-value.
    """

    def __init__(self, params):
        """Build the policy network and load its saved weights.

        Params
        ======
            params (mapping): must provide the keys 'num_actions',
                'device' and 'path_model'
        """
        # The argument was previously spelled ``parmas`` while the body read
        # ``params``, which raised NameError unless a global of that name
        # happened to exist.
        self.num_actions = params['num_actions']
        self.device = params['device']
        self.path_model = params['path_model']
        self.policy_net = QNetwork(self.num_actions).to(self.device)
        # map_location lets weights saved on one device load onto another.
        self.policy_net.load_state_dict(
            torch.load(self.path_model, map_location=self.device))
        # Evaluation mode: this object never trains the network.
        self.policy_net.eval()

    def decide_action(self, state):
        """Return the index (int) of the highest-valued action for ``state``.

        The raw network output is kept on ``self.q_vals``.

        Params
        ======
            state (numpy.ndarray): a single, un-batched observation
        """
        with torch.no_grad():
            # Copy the array before wrapping it, then add a batch axis.
            self.q_vals = self.policy_net(
                torch.from_numpy(state.copy()).float().to(
                    self.device).unsqueeze(0))

        return int(self.q_vals.max(1)[1].view(1, 1))

Пример #2

Показать файл

class Agent():
    """DQN agent that stores experience, acts epsilon-greedily and trains a Q-network."""

    def __init__(self, state_size, action_size):
        """Set up the online/target networks, the optimizer and the replay buffer.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
        """
        self.state_size = state_size
        self.action_size = action_size

        # The "local" network is the one being optimized; the "target"
        # network only follows it through soft updates.
        self.qnetwork_local = QNetwork(state_size, action_size).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE)
        # Counts environment steps modulo UPDATE_EVERY.
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Record one transition and, every UPDATE_EVERY calls, run a learning step."""
        self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        # Learn only on the scheduled step and only once the buffer holds
        # more than one batch worth of samples.
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            self.learn(self.memory.sample(), GAMMA)

    def act(self, state, eps=0.):
        """Pick an action for ``state`` with an epsilon-greedy rule.

        Params
        ======
            state (array_like): current state
            eps (float): probability-like threshold for taking a random action
        """
        state_batch = torch.from_numpy(state).float().unsqueeze(0).to(device)

        net = self.qnetwork_local
        net.eval()
        with torch.no_grad():
            action_values = net(state_batch)
        net.train()

        greedy = random.random() > eps
        if not greedy:
            return random.choice(np.arange(self.action_size))
        return np.argmax(action_values.cpu().data.numpy())

    def learn(self, experiences, gamma):
        """Run one gradient step on a batch and soft-update the target network.

        Params
        ======
            experiences (tuple): (states, actions, rewards, next_states, dones)
                as numpy arrays
            gamma (float): discount factor
        """
        def to_device(array, integer=False):
            tensor = torch.from_numpy(array)
            tensor = tensor.long() if integer else tensor.float()
            return tensor.to(device)

        raw_states, raw_actions, raw_rewards, raw_next, raw_dones = experiences
        states = to_device(raw_states)
        actions = to_device(raw_actions, integer=True)
        rewards = to_device(raw_rewards)
        next_states = to_device(raw_next)
        dones = to_device(raw_dones)

        # Best next-state value according to the target network (no gradient).
        next_q = self.qnetwork_target(next_states).detach()
        best_next_q = next_q.max(1)[0].unsqueeze(1)
        # TD target; the (1 - dones) factor drops the bootstrap term on terminal steps.
        td_target = rewards + (gamma * best_next_q * (1 - dones))

        # Q-values the local network assigns to the actions actually taken.
        q_all = self.qnetwork_local(states)
        q_taken = q_all.gather(1, actions)

        loss = F.mse_loss(q_taken, td_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Blend local weights into the target model in place.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        pairs = zip(target_model.parameters(), local_model.parameters())
        for dst, src in pairs:
            blended = tau * src.data + (1.0 - tau) * dst.data
            dst.data.copy_(blended)

Пример #3

Показать файл

Файл: agent.py Проект: dcfwight/reinforcement_learning_bananas

class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so assigning its result would leave
        # self.seed empty; seed the generator and keep the value separately.
        random.seed(seed)
        self.seed = seed

        # Q-Network: the local network is optimized, the target network
        # follows it through soft updates.
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Time step counter (for learning every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and learn every UPDATE_EVERY calls."""
        # Save experience in the replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps
        self.t_step = (self.t_step + 1) % UPDATE_EVERY

        if self.t_step == 0:
            # Sample from memory ONLY if it holds more than one batch
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Return an action for the given state as per the current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
                (keeps an element of exploration)
        """
        # The network expects a float batch, so: wrap the numpy array,
        # cast to float32, and add a batch axis of size 1 with unsqueeze(0).
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        # Back to training mode for the next learning step.
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update the parameters of the Q-network from a batch of experience.

        Params
        ======
            experiences (tuple): (states, actions, rewards, next_states, dones);
                presumably already tensors on the right device, since they are
                fed to the networks unconverted -- confirm against ReplayBuffer
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Max predicted Q-value of each next state, from the target model.
        # detach() cuts the result out of the autograd graph: the target
        # model is not trained by this loss.
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Q targets for current states; (1 - dones) removes the bootstrap
        # term on terminal transitions.
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Expected Q-values of the actions actually taken, from the local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute and minimize the loss
        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

Пример #4

Показать файл

Файл: brain.py Проект: naru380/dqn-tetris

class Brain:
    """DQN learner: a policy network, a target network synced on demand, and replay memory."""

    def __init__(self, params):
        """Build networks, optimizer and replay memory from a settings mapping.

        Params
        ======
            params (mapping): must provide 'num_actions', 'device',
                'batch_size', 'learning_rate', 'gamma', 'eps_start',
                'eps_end', 'eps_decay' and 'replay_memory_size'
        """
        self.num_actions = params['num_actions']
        self.device = params['device']
        self.batch_size = params['batch_size']
        self.learning_rate = params['learning_rate']
        self.gamma = params['gamma']
        self.eps_start = params['eps_start']
        self.eps_end = params['eps_end']
        self.eps_decay = params['eps_decay']
        self.policy_net = QNetwork(self.num_actions).to(self.device)
        self.target_net = QNetwork(self.num_actions).to(self.device)
        # Start the target network as an exact copy of the policy network;
        # it is only ever evaluated, never optimized.
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.memory = ReplayMemory(params['replay_memory_size'])
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.learning_rate)
        self.steps_done = 0
        # Last Q-values and loss, kept as attributes for outside inspection.
        self.q_vals = [0] * self.num_actions
        self.loss = 0

    def decide_action(self, state):
        """Return a (1, 1) long tensor holding an epsilon-greedy action.

        Epsilon decays exponentially from ``eps_start`` towards ``eps_end``
        as ``steps_done`` grows. The Q-values of ``state`` are always
        computed and stored on ``self.q_vals``, even when the action ends up
        random.

        Params
        ======
            state (numpy.ndarray): a single, un-batched observation
        """
        eps_threshold = self.eps_end + (
            self.eps_start - self.eps_end) * math.exp(
                -1. * self.steps_done / self.eps_decay)
        self.steps_done += 1
        with torch.no_grad():
            self.q_vals = self.policy_net(
                torch.from_numpy(state).float().to(self.device).unsqueeze(0))
        if random.random() > eps_threshold:
            # q_vals was produced under no_grad, so no extra guard is needed.
            return self.q_vals.max(1)[1].view(1, 1)
        return torch.tensor([[random.randrange(self.num_actions)]],
                            device=self.device,
                            dtype=torch.long)

    def optimize(self):
        """Sample a batch from replay memory and take one optimizer step.

        The loss (smooth L1 between predicted and bootstrapped Q-values) is
        stored on ``self.loss``. Transitions whose ``next_state`` is None are
        treated as terminal and get a next-state value of zero.
        """
        transitions = self.memory.sample(self.batch_size)
        # Transpose a list of transitions into one transition of tuples.
        batch = Transition(*zip(*transitions))

        non_final_mask = torch.tensor(
            tuple(s is not None for s in batch.next_state),
            device=self.device,
            dtype=torch.bool)
        non_final_next_states = [
            torch.tensor(s, device=self.device, dtype=torch.float)
            for s in batch.next_state if s is not None
        ]

        state_batch = torch.tensor(batch.state,
                                   device=self.device,
                                   dtype=torch.float)
        action_batch = torch.tensor(batch.action,
                                    device=self.device,
                                    dtype=torch.long)
        # NOTE(review): integer dtype truncates fractional rewards --
        # confirm that the environment only emits whole-number rewards.
        reward_batch = torch.tensor(batch.reward,
                                    device=self.device,
                                    dtype=torch.int)

        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch.unsqueeze(1))

        next_state_values = torch.zeros(self.batch_size, device=self.device)
        # torch.cat rejects an empty list, so skip the target-network pass
        # when every sampled transition is terminal.
        if non_final_next_states:
            with torch.no_grad():
                next_state_values[non_final_mask] = self.target_net(
                    torch.cat(non_final_next_states).unsqueeze(1)).max(1)[0]

        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch

        self.loss = F.smooth_l1_loss(state_action_values,
                                     expected_state_action_values.unsqueeze(1))

        self.optimizer.zero_grad()
        self.loss.backward()
        # Clamp every gradient element to [-1, 1]; this helper skips
        # parameters that received no gradient instead of failing on None.
        torch.nn.utils.clip_grad_value_(self.policy_net.parameters(), 1)
        self.optimizer.step()

    def update_target_network(self):
        """Copy the policy network's current weights into the target network."""
        self.target_net.load_state_dict(self.policy_net.state_dict())

Пример #5

Показать файл

Файл: model.py Проект: ruisp666/udacity_projects

class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so assigning its result would leave
        # self.seed empty; seed the generator and keep the value separately.
        random.seed(seed)
        self.seed = seed

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and learn every UPDATE_EVERY calls."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (tuple): (states, actions, rewards, next_states, dones);
                presumably tensors with rewards/dones holding one entry per
                sample -- confirm against ReplayBuffer
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Greedy value of each next state according to the target network.
        # no_grad keeps the target network out of the autograd graph.
        with torch.no_grad():
            next_q_max = self.qnetwork_target(next_states).max(1)[0].reshape(-1, 1)

        # 1.0 where the episode continues, 0.0 on terminal transitions.
        not_done = (dones == 0).to(next_q_max.dtype).reshape(-1, 1)
        # Build the target as a NEW tensor: the sampled rewards are left
        # untouched, and the caller-supplied ``gamma`` is honoured.
        target = rewards.reshape(-1, 1) + gamma * next_q_max * not_done

        # Q-values the local network assigns to the actions actually taken.
        output = self.qnetwork_local(states)
        output_action_value = output.gather(1, actions.view(-1, 1))

        # Mean squared TD error.
        loss = ((target - output_action_value) ** 2).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

Пример #6

Показать файл

class Agent(object):
    """Double-DQN agent configured through a settings mapping."""

    def __init__(self, state_size, action_size, seed, config):
        """Create networks, optimizer and replay buffer.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            config (mapping): must provide "LR", "BUFFER_SIZE", "BATCH_SIZE",
                "UPDATE_EVERY", "GAMMA" and "TAU"
        """
        self.state_size = state_size
        self.action_size = action_size
        self.config = config
        # random.seed() returns None, so assigning its result would leave
        # self.seed empty; seed the generator and keep the value separately.
        random.seed(seed)
        self.seed = seed

        self.local_q_net = QNetwork(state_size, action_size, seed).to(device)
        self.target_q_net = QNetwork(state_size, action_size, seed).to(device)

        self.optimizer = optim.Adam(self.local_q_net.parameters(),
                                    lr=config["LR"])

        self.memory = ReplayBuffer(action_size, config["BUFFER_SIZE"],
                                   config["BATCH_SIZE"], seed)

        # Counts environment steps modulo config["UPDATE_EVERY"].
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and learn every config["UPDATE_EVERY"] calls."""
        self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % self.config["UPDATE_EVERY"]

        if self.t_step == 0:
            # if agent experienced enough
            if len(self.memory) > self.config["BATCH_SIZE"]:
                experiences = self.memory.sample()
                # Learn from previous experiences
                self.learn(experiences, self.config["GAMMA"])

    def act(self, state, eps=0.0):
        """Return an epsilon-greedy action for ``state``.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.local_q_net.eval()
        with torch.no_grad():
            action_values = self.local_q_net(state)
        self.local_q_net.train()

        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Run one Double Q-learning update and soft-update the target network.

        Params
        ======
            experiences (tuple): (states, actions, rewards, next_states, dones);
                presumably already tensors, since they are fed to the networks
                unconverted -- confirm against ReplayBuffer
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Double DQN: the local network chooses the next action ...
        q_targets_next_expected = self.local_q_net(next_states).detach()
        q_targets_next_expected_actions = q_targets_next_expected.max(
            1)[1].unsqueeze(1)

        # ... and the target network evaluates it. The result is detached so
        # backward() does not run through the target network, whose
        # parameters are not optimized by this loss.
        q_targets_next = self.target_q_net(next_states).gather(
            1, q_targets_next_expected_actions).detach()

        # Non over-estimated targets; (1 - dones) drops the bootstrap term
        # on terminal transitions.
        q_targets = rewards + (gamma * q_targets_next * (1 - dones))

        # Expected value of the actions actually taken
        q_expected = self.local_q_net(states).gather(1, actions)

        loss = torch.nn.functional.mse_loss(q_expected, q_targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.local_q_net, self.target_q_net,
                         self.config["TAU"])

    def soft_update(self, local_net, target_net, tau):
        """Blend local weights into the target network in place.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_net (PyTorch model): weights will be copied from
            target_net (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_net.parameters(),
                                             local_net.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1 - tau) * target_param.data)