Example #1

import math
import random

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# DQN and ReplayBuffer are assumed to be defined elsewhere in the project.

class Agent(nn.Module):
    def __init__(self,
                 input_shape,
                 num_actions,
                 device,
                 PATH,
                 gamma=0.95,
                 learning_rate=0.001,
                 replay_size=10000,
                 batch_size=128):
        super(Agent, self).__init__()

        self.device = device
        self.PATH = PATH
        self.input_shape = input_shape
        self.gamma = gamma
        self.lr = learning_rate
        self.num_actions = num_actions

        # Exponentially decaying epsilon-greedy exploration schedule
        epsilon_start = 1.0
        epsilon_final = 0.01
        epsilon_decay = 200
        self.epsilon_by_frame = lambda frame_idx: epsilon_final + (
            epsilon_start - epsilon_final) * math.exp(-1. * frame_idx /
                                                      epsilon_decay)
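        # With these defaults: epsilon ~ 1.0 at frame 0, ~ 0.37 at frame 200,
        # and ~ 0.017 at frame 1000, approaching the 0.01 floor.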

        self.replay_size = replay_size
        self.batch_size = batch_size

        self.policy_net = DQN(input_shape, num_actions).to(device)
        self.target_net = DQN(input_shape, num_actions).to(device)
        # Start the target network from the same weights as the policy network
        self.target_net.load_state_dict(self.policy_net.state_dict())

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)

        self.replay_buffer = ReplayBuffer(replay_size)

        self.best_loss = 9999

    def declare_networks(self):
        self.policy_net = DQN(self.input_shape, self.num_actions).to(self.device)
        self.target_net = DQN(self.input_shape, self.num_actions).to(self.device)

    def declare_memory(self):
        self.replay_buffer = ReplayBuffer(self.replay_size)

    def compute_loss(self):
        """Sample a batch, compute the Double DQN loss, and take one optimizer step."""
        if len(self.replay_buffer) > self.batch_size:
            state, action, reward, next_state, done = self.replay_buffer.sample(
                self.batch_size)

            state = torch.as_tensor(np.array(state), dtype=torch.float32).to(self.device)
            action = torch.as_tensor(action, dtype=torch.long).to(self.device)
            reward = torch.as_tensor(np.array(reward), dtype=torch.float32).to(self.device)
            next_state = torch.as_tensor(np.array(next_state), dtype=torch.float32).to(self.device)
            done = torch.as_tensor(np.array(done), dtype=torch.float32).to(self.device)

            q_values = self.policy_net(state)
            q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)

            with torch.no_grad():
                # Double DQN target: choose the next action with the policy net,
                # evaluate it with the target net
                next_q_values = self.policy_net(next_state)
                next_q_state_values = self.target_net(next_state)
                next_q_value = next_q_state_values.gather(
                    1,
                    torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)

            # Bellman target; (1 - done) zeroes the bootstrap term for terminal states
            expected_q_value = reward + self.gamma * next_q_value * (1 - done)

            # MSE
            loss = (q_value - expected_q_value.detach()).pow(2).mean()

            self.optimizer.zero_grad()
            loss.backward()
            # Clamp gradients element-wise to [-1, 1] to stabilize training
            for param in self.policy_net.parameters():
                param.grad.data.clamp_(-1, 1)
            self.optimizer.step()

            # Checkpoint whenever a new lowest loss is reached
            if loss.item() < self.best_loss:
                self.model_save()
                self.best_loss = loss.item()

            return loss.item()
        else:
            # Not enough samples in the replay buffer yet
            return 9999

    def append_buffer(self, state, action, reward, next_state, done):
        self.replay_buffer.push(state, action, reward, next_state, done)

    def get_action(self, state, episode):
        epsilon = self.epsilon_by_frame(episode)
        with torch.no_grad():
            if random.random() > epsilon:
                # state is expected to be a batched tensor already on self.device
                q_value = self.policy_net(state)
                action = q_value.max(1)[1].item()
            else:
                action = np.random.randint(0, self.num_actions)

        return action

    def update_target_model(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def model_save(self):
        torch.save(
            {
                'model_state_dict': self.policy_net.state_dict(),
                'optimizer_state_dict': self.optimizer.state_dict(),
            }, self.PATH)

    def model_load(self):
        # map_location lets a checkpoint saved on GPU be loaded on any device
        checkpoint = torch.load(self.PATH, map_location=self.device)

        self.policy_net.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
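

# The training loop below is a usage sketch, not part of the original example.
# It assumes a classic Gym-style environment whose step() returns
# (observation, reward, done, info), plus the project's DQN and ReplayBuffer
# classes; the function name and checkpoint path are placeholders, and the
# function is never called here.
def train_dqn_sketch(env, num_episodes=500, target_sync_every=10):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    agent = Agent(input_shape=env.observation_space.shape,
                  num_actions=env.action_space.n,
                  device=device,
                  PATH="dqn_checkpoint.pth")

    for episode in range(num_episodes):
        state = env.reset()
        done = False
        while not done:
            # get_action expects a batched float tensor on the agent's device
            state_t = torch.as_tensor(np.array(state), dtype=torch.float32,
                                      device=device).unsqueeze(0)
            action = agent.get_action(state_t, episode)
            next_state, reward, done, _ = env.step(action)
            agent.append_buffer(state, action, reward, next_state, done)
            agent.compute_loss()  # one optimization step, if enough samples are stored
            state = next_state
        if episode % target_sync_every == 0:
            agent.update_target_model()  # periodically sync the target network
    return agent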


# The Agent below implements DDPG with Keras models; Actor, Critic, OUNoise and
# ReplayBuffer are assumed to be defined elsewhere in the project.
class Agent:
    """Reinforcement Learning Agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor Policy Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model params with local model params
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        # Noise Process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay Memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm Parameters
        self.gamma = 0.99  # Discount Factor
        self.tau = 0.01  # for Soft Update of Target Parameters

        self.score = 0
        self.best_score = -np.inf
        self.count = 0
        self.total_reward = 0.0

    def reset_episode(self):
        self.count = 0
        self.total_reward = 0.0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        """Save the experience in replay memory and learn once enough samples are available."""
        self.count += 1
        self.total_reward += reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn if enough samples are present in memory
        if len(self.memory) > self.batch_size:
            self.score = reward
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over the last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # Add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        self.score = self.total_reward / float(
            self.count) if self.count else 0.0

        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        action_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, action_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train the actor model using the action-value gradients from the critic;
        # the trailing 0/1 arguments are passed through to the project's custom
        # get_action_gradients / train_fn helpers (typically the Keras learning-phase flag)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        if self.score > self.best_score:
            self.best_score = self.score

    def soft_update(self, local_model, target_model):
        """Soft update of model parameters: target = tau * local + (1 - tau) * target."""
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(target_weights)

        # Blend each weight array individually (the weight list is ragged,
        # so it cannot be stacked into a single NumPy array)
        new_weights = [self.tau * lw + (1 - self.tau) * tw
                       for lw, tw in zip(local_weights, target_weights)]
        target_model.set_weights(new_weights)
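

# The loop below is a usage sketch, not part of the original example. It assumes
# a Task object of the kind this agent was written against, exposing reset() and
# step(action) -> (next_state, reward, done) along with state_size, action_size,
# action_low and action_high; the function name is a placeholder and it is never
# called here.
def run_ddpg_sketch(task, num_episodes=200):
    agent = Agent(task)
    for _ in range(num_episodes):
        state = agent.reset_episode()
        done = False
        while not done:
            action = agent.act(state)                     # noisy action from the local actor
            next_state, reward, done = task.step(action)  # assumed Task.step signature
            agent.step(action, reward, next_state, done)  # store experience and learn
            state = next_state
    return agent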