class TestReplayMemory(unittest.TestCase):
  def setUp(self):
    self.memory = ReplayMemory(capacity=10)

  def test_append(self):
    for i in range(20):
      a = Transition([0, 1, 2, 3], 0, [4, 5, 6, 7], 0, True)
      self.memory.push(a)
    self.assertEqual(len(self.memory.memory), 10)

  def test_sample(self):
    for i in range(10):
      a = Transition([0, 1, 2, i], 0, [4, 5, 6, i*i], 0, True)
      self.memory.push(a)

    s, a, s1, r, done = self.memory.sample(2)
    self.assertEqual(s.shape, (2, 4))
    self.assertEqual(a.shape, (2, 1))
    self.assertEqual(s1.shape, (2, 4))
    self.assertEqual(r.shape, (2, 1))
    self.assertEqual(done.shape, (2, 1))

  def test_multi_step(self):
    self.memory = ReplayMemory(capacity=10, multi_step_n=2)
    for i in range(5):
      a = Transition([0, 1, 2, i], 0, [4, 5, 6, i*i], 1, False)
      self.memory.push(a)
    final = Transition([0, 1, 2, 10], 0, [4, 5, 6, 100], 10, True)
    self.memory.push(final)
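    # Expected 2-step returns (assuming the memory discounts with gamma = 0.99):
    #   index 0: 1 + 0.99*1 + 0.99**2*1  = 2.9701
    #   index 3: 1 + 0.99*1 + 0.99**2*10 = 11.791
    #   index 4: 1 + 0.99*10             = 10.9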
    self.assertEqual(self.memory.memory[0].r, 2.9701)
    self.assertEqual(self.memory.memory[3].r, 11.791)
    self.assertEqual(self.memory.memory[4].r, 10.9)
    self.assertEqual(self.memory.memory[5].r, 10)

  def test_zero_step(self):
    self.memory = ReplayMemory(capacity=10, multi_step_n=0)
    for i in range(5):
      a = Transition([0, 1, 2, i], 0, [4, 5, 6, i*i], 1, False)
      self.memory.push(a)
    final = Transition([0, 1, 2, 10], 0, [4, 5, 6, 100], 10, True)
    self.memory.push(final)
    self.assertEqual(self.memory.memory[0].r, 1)
    self.assertEqual(self.memory.memory[3].r, 1)
    self.assertEqual(self.memory.memory[4].r, 1)
    self.assertEqual(self.memory.memory[5].r, 10)
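
# A minimal ReplayMemory sketch consistent with the tests above (an assumption:
# the real implementation lives elsewhere in this repo, and gamma = 0.99 plus the
# n-step reward folding below are inferred from the expected values in
# test_multi_step). Later examples on this page use ReplayMemory variants with
# different push/sample signatures.
import random
from collections import deque, namedtuple

import numpy as np

Transition = namedtuple("Transition", ["s", "a", "s_1", "r", "done"])


class ReplayMemory:
  def __init__(self, capacity, multi_step_n=0, gamma=0.99):
    self.memory = deque(maxlen=capacity)
    self.multi_step_n = multi_step_n
    self.gamma = gamma

  def __len__(self):
    return len(self.memory)

  def push(self, transition):
    # Fold the incoming reward into the previous n entries (n-step return).
    for k in range(1, self.multi_step_n + 1):
      if len(self.memory) >= k and not self.memory[-k].done:
        prev = self.memory[-k]
        self.memory[-k] = prev._replace(r=prev.r + (self.gamma ** k) * transition.r)
    self.memory.append(transition)

  def sample(self, batch_size):
    batch = random.sample(list(self.memory), batch_size)
    s = np.stack([t.s for t in batch])
    a = np.array([[t.a] for t in batch])
    s_1 = np.stack([t.s_1 for t in batch])
    r = np.array([[t.r] for t in batch])
    done = np.array([[t.done] for t in batch])
    return s, a, s_1, r, done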
Example #2
class hDQN():
    """
    The Hierarchical-DQN Agent
    Parameters
    ----------
        optimizer_spec: OptimizerSpec
            Specifying the constructor and kwargs, as well as learning rate schedule
            for the optimizer
        num_goal: int
            The number of goals the agent can choose from
        num_action: int
            The number of actions the agent can choose from
        replay_memory_size: int
            How many memories to store in the replay memory.
        batch_size: int
            How many transitions to sample each time experience is replayed.
    """
    def __init__(self,
                 optimizer_spec,
                 num_goal=6,
                 num_action=2,
                 replay_memory_size=10000,
                 batch_size=128):
        ###############
        # BUILD MODEL #
        ###############
        self.num_goal = num_goal
        self.num_action = num_action
        self.batch_size = batch_size
        # Construct meta-controller and controller
        self.meta_controller = MetaController().type(dtype)
        self.target_meta_controller = MetaController().type(dtype)
        self.controller = Controller().type(dtype)
        self.target_controller = Controller().type(dtype)
        # Construct the optimizers for meta-controller and controller
        self.meta_optimizer = optimizer_spec.constructor(
            self.meta_controller.parameters(), **optimizer_spec.kwargs)
        self.ctrl_optimizer = optimizer_spec.constructor(
            self.controller.parameters(), **optimizer_spec.kwargs)
        # Construct the replay memory for meta-controller and controller
        self.meta_replay_memory = ReplayMemory(replay_memory_size)
        self.ctrl_replay_memory = ReplayMemory(replay_memory_size)

    def get_intrinsic_reward(self, goal, state):
        return 1.0 if goal == state else 0.0

    def select_goal(self, state, epsilon):
        sample = random.random()
        if sample > epsilon:
            state = torch.from_numpy(state).type(dtype)
            # Inference only: no need to track gradients
            with torch.no_grad():
                return self.meta_controller(state).max(1)[1].cpu()
        else:
            return torch.IntTensor([random.randrange(self.num_goal)])

    def select_action(self, joint_state_goal, epsilon):
        sample = random.random()
        if sample > epsilon:
            joint_state_goal = torch.from_numpy(joint_state_goal).type(dtype)
            # Inference only: no need to track gradients
            with torch.no_grad():
                return self.controller(joint_state_goal).max(1)[1].cpu()
        else:
            return torch.IntTensor([random.randrange(self.num_action)])

    def update_meta_controller(self, gamma=1.0):
        if len(self.meta_replay_memory) < self.batch_size:
            return
        state_batch, goal_batch, next_state_batch, ex_reward_batch, done_mask = \
            self.meta_replay_memory.sample(self.batch_size)
        state_batch = Variable(torch.from_numpy(state_batch).type(dtype))
        goal_batch = Variable(torch.from_numpy(goal_batch).long())
        next_state_batch = Variable(
            torch.from_numpy(next_state_batch).type(dtype))
        ex_reward_batch = Variable(
            torch.from_numpy(ex_reward_batch).type(dtype))
        not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)
        if USE_CUDA:
            goal_batch = goal_batch.cuda()
        # Compute current Q values; the meta-controller takes only the state and outputs a value for every state-goal pair.
        # We choose Q based on the goal chosen.
        current_Q_values = self.meta_controller(state_batch).gather(
            1, goal_batch.unsqueeze(1))
        # Compute next Q value based on which goal gives max Q values
        # Detach from the current graph since we don't want gradients for next Q to propagate
        next_max_q = self.target_meta_controller(
            next_state_batch).detach().max(1)[0]
        next_Q_values = not_done_mask * next_max_q
        # Compute the target of the current Q values
        target_Q_values = ex_reward_batch + (gamma * next_Q_values)
        # Compute Bellman error (using Huber loss)
        loss = F.smooth_l1_loss(current_Q_values.view(-1), target_Q_values)

        # Copy Q to target Q before updating parameters of Q
        self.target_meta_controller.load_state_dict(
            self.meta_controller.state_dict())
        # Optimize the model
        self.meta_optimizer.zero_grad()
        loss.backward()
        for param in self.meta_controller.parameters():
            param.grad.data.clamp_(-1, 1)
        self.meta_optimizer.step()

    def update_controller(self, gamma=1.0):
        if len(self.ctrl_replay_memory) < self.batch_size:
            return
        state_goal_batch, action_batch, next_state_goal_batch, in_reward_batch, done_mask = \
            self.ctrl_replay_memory.sample(self.batch_size)
        state_goal_batch = Variable(
            torch.from_numpy(state_goal_batch).type(dtype))
        action_batch = Variable(torch.from_numpy(action_batch).long())
        next_state_goal_batch = Variable(
            torch.from_numpy(next_state_goal_batch).type(dtype))
        in_reward_batch = Variable(
            torch.from_numpy(in_reward_batch).type(dtype))
        not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)
        if USE_CUDA:
            action_batch = action_batch.cuda()
        # Compute current Q values; the controller takes only (state, goal) and outputs a value for every (state, goal)-action pair.
        # We choose Q based on the action taken.
        current_Q_values = self.controller(state_goal_batch).gather(
            1, action_batch.unsqueeze(1))
        # Compute next Q value based on which action gives max Q values
        # Detach from the current graph since we don't want gradients for next Q to propagate
        next_max_q = self.target_controller(
            next_state_goal_batch).detach().max(1)[0]
        next_Q_values = not_done_mask * next_max_q
        # Compute the target of the current Q values
        target_Q_values = in_reward_batch + (gamma * next_Q_values)
        # Compute Bellman error (using Huber loss)
        loss = F.smooth_l1_loss(current_Q_values.view(-1), target_Q_values)

        # Copy Q to target Q before updating parameters of Q
        self.target_controller.load_state_dict(self.controller.state_dict())
        # Optimize the model
        self.ctrl_optimizer.zero_grad()
        loss.backward()
        for param in self.controller.parameters():
            param.grad.data.clamp_(-1, 1)
        self.ctrl_optimizer.step()
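
# A hypothetical usage sketch (assumptions: OptimizerSpec is a namedtuple with
# "constructor" and "kwargs" fields, and dtype / MetaController / Controller /
# ReplayMemory are provided by the surrounding module).
from collections import namedtuple

import torch.optim as optim

OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs"])

optimizer_spec = OptimizerSpec(constructor=optim.RMSprop,
                               kwargs=dict(lr=0.00025, alpha=0.95, eps=0.01))
agent = hDQN(optimizer_spec, num_goal=6, num_action=2,
             replay_memory_size=10000, batch_size=128)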
Example #3
            timestep += 1
            epoch_return += reward

            mask = torch.Tensor([done]).to(device)
            reward = torch.Tensor([reward]).to(device)
            next_state = torch.Tensor([next_state]).to(device)

            memory.push(state, action, mask, next_state, reward)

            state = next_state

            epoch_value_loss = 0
            epoch_policy_loss = 0

            if len(memory) > args.batch_size:
                transitions = memory.sample(args.batch_size)
                # Transpose the batch
                # (see http://stackoverflow.com/a/19343/3343043 for detailed explanation).
                batch = Transition(*zip(*transitions))

                # Update actor and critic according to the batch
                value_loss, policy_loss = agent.update_params(batch)

                epoch_value_loss += value_loss
                epoch_policy_loss += policy_loss

            if done:
                break

        rewards.append(epoch_return)
        value_losses.append(epoch_value_loss)
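
# The snippet above assumes the usual namedtuple-based transition container,
# e.g. (a hypothetical definition matching the push(...) argument order):
#
#   Transition = namedtuple('Transition',
#                           ('state', 'action', 'mask', 'next_state', 'reward'))
#
# so that Transition(*zip(*transitions)) transposes a list of transitions into
# a single Transition of batched fields.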
Example #4
class DQNDoubleQAgent(BaseAgent):
    def __init__(self):
        super(DQNDoubleQAgent, self).__init__()
        self.training = False
        self.max_frames = 2000000
        self._epsilon = Epsilon(start=1.0, end=0.1, update_increment=0.0001)
        self.gamma = 0.99
        self.train_q_per_step = 4
        self.train_q_batch_size = 256
        self.steps_before_training = 10000
        self.target_q_update_frequency = 50000

        self._Q_weights_path = "./data/SC2DoubleQAgent"
        self._Q = DQNCNN()
        if os.path.isfile(self._Q_weights_path):
            self._Q.load_state_dict(torch.load(self._Q_weights_path))
            print("Loading weights:", self._Q_weights_path)
        self._Qt = copy.deepcopy(self._Q)
        self._Q.cuda()
        self._Qt.cuda()
        self._optimizer = optim.Adam(self._Q.parameters(), lr=1e-8)
        self._criterion = nn.MSELoss()
        self._memory = ReplayMemory(100000)

        self._loss = deque(maxlen=1000)
        self._max_q = deque(maxlen=1000)
        self._action = None
        self._screen = None
        self._fig = plt.figure()
        self._plot = [plt.subplot(2, 2, i + 1) for i in range(4)]

        self._screen_size = 28

    def get_env_action(self, action, obs):
        action = np.unravel_index(action,
                                  [1, self._screen_size, self._screen_size])
        target = [action[2], action[1]]
        command = _MOVE_SCREEN  # action[0]  # unit selection is removed from the equation
        # if command == 0:
        #   command = _SELECT_POINT
        # else:
        #   command = _MOVE_SCREEN

        if command in obs.observation["available_actions"]:
            return actions.FunctionCall(command, [[0], target])
        else:
            return actions.FunctionCall(_NO_OP, [])

    '''
    :param
      s = obs.observation["screen"]
    :returns
      action = argmax action
    '''

    def get_action(self, s):
        # greedy
        if np.random.rand() > self._epsilon.value():
            # print("greedy action")
            s = Variable(torch.from_numpy(s).cuda())
            s = s.unsqueeze(0).float()
            self._action = self._Q(s).squeeze().cpu().data.numpy()
            return self._action.argmax()
        # explore
        else:
            # print("random choice")
            # action = np.random.choice([0, 1])
            action = 0
            target = np.random.randint(0, self._screen_size, size=2)
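            # Flatten (action, row, col) into a single index; get_env_action
            # recovers it with np.unravel_index.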
            return action * self._screen_size * self._screen_size + target[
                0] * self._screen_size + target[1]

    def select_friendly_action(self, obs):
        player_relative = obs.observation["screen"][_PLAYER_RELATIVE]
        friendly_y, friendly_x = (
            player_relative == _PLAYER_FRIENDLY).nonzero()
        target = [int(friendly_x.mean()), int(friendly_y.mean())]
        return actions.FunctionCall(_SELECT_POINT, [[0], target])

    def train(self, env, training=True):
        self._epsilon.isTraining = training
        self.run_loop(env, self.max_frames)
        if self._epsilon.isTraining:
            torch.save(self._Q.state_dict(), self._Q_weights_path)

    def run_loop(self, env, max_frames=0):
        """A run loop to have agents and an environment interact."""
        total_frames = 0
        start_time = time.time()

        action_spec = env.action_spec()
        observation_spec = env.observation_spec()

        self.setup(observation_spec, action_spec)

        try:
            while True:
                obs = env.reset()[0]
                # remove unit selection from the equation by selecting the friendly on every new game.
                select_friendly = self.select_friendly_action(obs)
                obs = env.step([select_friendly])[0]
                # distance = self.get_reward(obs.observation["screen"])

                self.reset()

                while True:
                    total_frames += 1

                    self._screen = obs.observation["screen"][5]
                    s = np.expand_dims(obs.observation["screen"][5], 0)
                    # plt.imshow(s[5])
                    # plt.pause(0.00001)
                    if max_frames and total_frames >= max_frames:
                        print("max frames reached")
                        return
                    if obs.last():
                        print("total frames:", total_frames, "Epsilon:",
                              self._epsilon.value())
                        self._epsilon.increment()
                        break

                    action = self.get_action(s)
                    env_actions = self.get_env_action(action, obs)
                    obs = env.step([env_actions])[0]

                    r = obs.reward
                    s1 = np.expand_dims(obs.observation["screen"][5], 0)
                    done = r > 0
                    if self._epsilon.isTraining:
                        transition = Transition(s, action, s1, r, done)
                        self._memory.push(transition)

                    if total_frames % self.train_q_per_step == 0 and total_frames > self.steps_before_training and self._epsilon.isTraining:
                        self.train_q()
                        # pass

                    if total_frames % self.target_q_update_frequency == 0 and total_frames > self.steps_before_training and self._epsilon.isTraining:
                        self._Qt = copy.deepcopy(self._Q)
                        self.show_chart()

                    if total_frames % 1000 == 0 and total_frames > self.steps_before_training and self._epsilon.isTraining:
                        self.show_chart()

                    if not self._epsilon.isTraining and total_frames % 3 == 0:
                        self.show_chart()

        except KeyboardInterrupt:
            pass
        finally:
            print("finished")
            elapsed_time = time.time() - start_time
            print("Took %.3f seconds for %s steps: %.3f fps" %
                  (elapsed_time, total_frames, total_frames / elapsed_time))

    def get_reward(self, s):
        player_relative = s[_PLAYER_RELATIVE]
        neutral_y, neutral_x = (player_relative == _PLAYER_NEUTRAL).nonzero()
        neutral_target = [int(neutral_x.mean()), int(neutral_y.mean())]
        friendly_y, friendly_x = (
            player_relative == _PLAYER_FRIENDLY).nonzero()
        if len(friendly_y) == 0 or len(friendly_x) == 0:  # no friendly units on screen
            return 0
        friendly_target = [int(friendly_x.mean()), int(friendly_y.mean())]

        distance_2 = (neutral_target[0] - friendly_target[0])**2 + (
            neutral_target[1] - friendly_target[1])**2
        distance = math.sqrt(distance_2)
        return -distance

    def show_chart(self):
        self._plot[0].clear()
        self._plot[0].set_xlabel('Last 1000 Training Cycles')
        self._plot[0].set_ylabel('Loss')
        self._plot[0].plot(list(self._loss))

        self._plot[1].clear()
        self._plot[1].set_xlabel('Last 1000 Training Cycles')
        self._plot[1].set_ylabel('Max Q')
        self._plot[1].plot(list(self._max_q))

        self._plot[2].clear()
        self._plot[2].set_title("screen")
        self._plot[2].imshow(self._screen)

        self._plot[3].clear()
        self._plot[3].set_title("action")
        self._plot[3].imshow(self._action)
        plt.pause(0.00001)

    def train_q(self):
        if self.train_q_batch_size >= len(self._memory):
            return

        s, a, s_1, r, done = self._memory.sample(self.train_q_batch_size)
        s = Variable(torch.from_numpy(s).cuda()).float()
        a = Variable(torch.from_numpy(a).cuda()).long()
        s_1 = Variable(torch.from_numpy(s_1).cuda(), volatile=True).float()
        r = Variable(torch.from_numpy(r).cuda()).float()
        done = Variable(torch.from_numpy(1 - done).cuda()).float()

        # Q_sa = r + gamma * max(Q_s'a')
        Q = self._Q(s)
        Q = Q.view(self.train_q_batch_size, -1)
        Q = Q.gather(1, a)

        Qt = self._Qt(s_1).view(self.train_q_batch_size, -1)

        # double Q
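        # Double Q-learning: the online network selects the argmax action for s_1,
        # while the target network Qt evaluates that action's value below.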
        best_action = self._Q(s_1).view(self.train_q_batch_size,
                                        -1).max(dim=1, keepdim=True)[1]
        y = r + done * self.gamma * Qt.gather(1, best_action)
        # Q
        # y = r + done * self.gamma * Qt.max(dim=1)[0].unsqueeze(1)

        y.volatile = False
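        # (volatile is a pre-0.4 PyTorch flag; newer versions would compute the
        # target under torch.no_grad() instead.)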

        loss = self._criterion(Q, y)
        self._loss.append(loss.sum().cpu().data.numpy())
        self._max_q.append(Q.max().cpu().data.numpy()[0])
        self._optimizer.zero_grad()  # zero the gradient buffers
        loss.backward()
        self._optimizer.step()
Example #5
class NEC:
    def __init__(self, env, args, device='cpu'):
        """
        Instantiate an NEC Agent
        ----------
        env: gym.Env
            gym environment to train on
        args: args class from argparser
            args come from train.py; see train.py for help with each arg
        device: string
            'cpu' or 'cuda:0' depending on use_cuda flag from train.py
        """
        self.environment_type = args.environment_type
        self.env = env
        self.device = device
        # Hyperparameters
        self.epsilon = args.initial_epsilon
        self.final_epsilon = args.final_epsilon
        self.epsilon_decay = args.epsilon_decay
        self.gamma = args.gamma
        self.N = args.N
        # Transition queue and replay memory
        self.transition_queue = []
        self.replay_every = args.replay_every
        self.replay_buffer_size = args.replay_buffer_size
        self.replay_memory = ReplayMemory(self.replay_buffer_size)
        # CNN for state embedding network
        self.frames_to_stack = args.frames_to_stack
        self.embedding_size = args.embedding_size
        self.in_height = args.in_height
        self.in_width = args.in_width
        self.cnn = CNN(self.frames_to_stack, self.embedding_size,
                       self.in_height, self.in_width).to(self.device)
        # Differentiable Neural Dictionary (DND): one for each action
        self.kernel = inverse_distance
        self.num_neighbors = args.num_neighbors
        self.max_memory = args.max_memory
        self.lr = args.lr
        self.dnd_list = []
        for i in range(env.action_space.n):
            self.dnd_list.append(
                DND(self.kernel, self.num_neighbors, self.max_memory,
                    args.optimizer, self.lr))
        # Optimizer for state embedding CNN
        self.q_lr = args.q_lr
        self.batch_size = args.batch_size
        self.optimizer = get_optimizer(args.optimizer, self.cnn.parameters(),
                                       self.lr)

    def choose_action(self, state_embedding):
        """
        Choose epsilon-greedy policy according to Q-estimates from DNDs
        """
        if random.uniform(0, 1) < self.epsilon:
            return random.randint(0, self.env.action_space.n - 1)
        else:
            qs = [dnd.lookup(state_embedding) for dnd in self.dnd_list]
            action = torch.argmax(torch.cat(qs))
            return action

    def Q_lookahead(self, t, warmup=False):
        """
        Return the N-step Q-value lookahead from time t in the transition queue
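        Q_N = sum_{i=0..N-1} gamma^i * r_{t+i} + gamma^N * max_a Q(s_{t+N}, a)
        (falls back to the discounted sum of the remaining rewards near the end
        of the queue or during warmup)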
        """
        if warmup or len(self.transition_queue) <= t + self.N:
            lookahead = [tr.reward for tr in self.transition_queue[t:]]
            discounted = discount(lookahead, self.gamma)
            Q_N = torch.tensor([discounted], requires_grad=True)
            return Q_N
        else:
            lookahead = [
                tr.reward for tr in self.transition_queue[t:t + self.N]
            ]
            discounted = discount(lookahead, self.gamma)
            state = self.transition_queue[t + self.N].state
            state = torch.tensor(state).permute(2, 0,
                                                1).unsqueeze(0)  # (N,C,H,W)
            state = state.to(self.device)
            state_embedding = self.cnn(state)
            Q_a = [dnd.lookup(state_embedding) for dnd in self.dnd_list]
            maxQ = torch.cat(Q_a).max()
            Q_N = discounted + (self.gamma**self.N) * maxQ
            Q_N = torch.tensor([Q_N], requires_grad=True)
            return Q_N

    def Q_update(self, Q, Q_N):
        """
        Return the Q-update for DND updates
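        Q <- Q + q_lr * (Q_N - Q), i.e. move the stored DND value toward the
        N-step return estimate.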
        """
        return Q + self.q_lr * (Q_N - Q)

    def update(self):
        """
        Iterate through the transition queue and make NEC updates
        """
        # Insert transitions into DNDs
        for t in range(len(self.transition_queue)):
            tr = self.transition_queue[t]
            action = tr.action
            state = torch.tensor(tr.state).permute(2, 0, 1)  # (C,H,W)
            state = state.unsqueeze(0).to(self.device)  # (N,C,H,W)
            state_embedding = self.cnn(state)
            dnd = self.dnd_list[action]

            Q_N = self.Q_lookahead(t).to(self.device)
            embedding_index = dnd.get_index(state_embedding)
            if embedding_index is None:
                dnd.insert(state_embedding.detach(), Q_N.detach().unsqueeze(0))
            else:
                Q = self.Q_update(dnd.values[embedding_index], Q_N)
                dnd.update(Q.detach(), embedding_index)
            Q_N = Q_N.detach().to(self.device)
            self.replay_memory.push(tr.state, action, Q_N)
        # Commit inserts
        for dnd in self.dnd_list:
            dnd.commit_insert()
        # Train CNN on minibatch
        for t in range(len(self.transition_queue)):
            if t % self.replay_every == 0 or t == len(
                    self.transition_queue) - 1:
                # Train on random mini-batch from self.replay_memory
                batch = self.replay_memory.sample(self.batch_size)
                actual_Qs = torch.cat([sample.Q_N for sample in batch])
                predicted_Qs = []
                for sample in batch:
                    state = torch.tensor(sample.state).permute(2, 0,
                                                               1)  # (C,H,W)
                    state = state.unsqueeze(0).to(self.device)  # (N,C,H,W)
                    state_embedding = self.cnn(state)
                    dnd = self.dnd_list[sample.action]
                    predicted_Q = dnd.lookup(state_embedding, update_flag=True)
                    predicted_Qs.append(predicted_Q)
                predicted_Qs = torch.cat(predicted_Qs).to(self.device)
                loss = torch.dist(actual_Qs, predicted_Qs)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                for dnd in self.dnd_list:
                    dnd.update_params()

        # Clear out transition queue
        self.transition_queue = []

    def run_episode(self):
        """
        Train an NEC agent for a single episode:
            Interact with environment
            Append (state, action, reward) transitions to transition queue
            Call update at the end of the episode
        """
        if self.epsilon > self.final_epsilon:
            self.epsilon = self.epsilon * self.epsilon_decay
        state = self.env.reset()
        if self.environment_type == 'fourrooms':
            fewest_steps = self.env.shortest_path_length(self.env.state)
        total_steps = 0
        total_reward = 0
        total_frames = 0
        done = False
        while not done:
            state_embedding = torch.tensor(state).permute(2, 0, 1)  # (C,H,W)
            state_embedding = state_embedding.unsqueeze(0).to(self.device)
            state_embedding = self.cnn(state_embedding)
            action = self.choose_action(state_embedding)
            next_state, reward, done, _ = self.env.step(action)
            self.transition_queue.append(Transition(state, action, reward))
            total_reward += reward
            total_frames += self.env.skip
            total_steps += 1
            state = next_state
        self.update()
        if self.environment_type == 'fourrooms':
            n_extra_steps = total_steps - fewest_steps
            return n_extra_steps, total_frames, total_reward
        else:
            return total_frames, total_reward

    def warmup(self):
        """
        Warmup the DND with values from an episode with a random policy
        """
        state = self.env.reset()
        total_reward = 0
        total_frames = 0
        done = False
        while not done:
            action = random.randint(0, self.env.action_space.n - 1)
            next_state, reward, done, _ = self.env.step(action)
            total_reward += reward
            total_frames += self.env.skip
            self.transition_queue.append(Transition(state, action, reward))
            state = next_state

        for t in range(len(self.transition_queue)):
            tr = self.transition_queue[t]
            state_embedding = torch.tensor(tr.state).permute(2, 0,
                                                             1)  # (C,H,W)
            state_embedding = state_embedding.unsqueeze(0).to(self.device)
            state_embedding = self.cnn(state_embedding)
            action = tr.action
            dnd = self.dnd_list[action]

            Q_N = self.Q_lookahead(t, True).to(self.device)
            if dnd.keys_to_be_inserted is None and dnd.keys is None:
                dnd.insert(state_embedding, Q_N.detach().unsqueeze(0))
            else:
                embedding_index = dnd.get_index(state_embedding)
                if embedding_index is None:
                    state_embedding = state_embedding.detach()
                    dnd.insert(state_embedding, Q_N.detach().unsqueeze(0))
                else:
                    Q = self.Q_update(dnd.values[embedding_index], Q_N)
                    dnd.update(Q.detach(), embedding_index)
            self.replay_memory.push(tr.state, action, Q_N.detach())
        for dnd in self.dnd_list:
            dnd.commit_insert()
        # Clear out transition queue
        self.transition_queue = []
        return total_frames, total_reward
Example #6
class DDPG():
    """
    The Deep Deterministic Policy Gradient (DDPG) Agent
    Parameters
    ----------
        actor_optimizer_spec: OptimizerSpec
            Specifying the constructor and kwargs, as well as learning rate and other
            parameters for the optimizer
        critic_optimizer_spec: OptimizerSpec
        num_feature: int
            The number of features of the environmental state
        num_action: int
            The number of available actions that agent can choose from
        replay_memory_size: int
            How many memories to store in the replay memory.
        batch_size: int
            How many transitions to sample each time experience is replayed.
        tau: float
            The update rate that target networks slowly track the learned networks.
    """
    def __init__(self,
                 actor_optimizer_spec,
                 critic_optimizer_spec,
                 num_feature,
                 num_action,
                 net_type,
                 replay_memory_size=1000000,
                 batch_size=64,
                 tau=0.001):
        ###############
        # BUILD MODEL #
        ###############
        self.num_feature = num_feature
        self.num_action = num_action
        self.batch_size = batch_size
        self.tau = tau
        # Construct actor and critic

        if net_type == 0:
            self.actor = MLPA(input_size=num_feature,
                              output_size=num_action,
                              hidden_size=(400, 300),
                              n_layers=2,
                              tanh_flag=1).type(dtype)
            self.target_actor = MLPA(input_size=num_feature,
                                     output_size=num_action,
                                     hidden_size=(400, 300),
                                     n_layers=2,
                                     tanh_flag=1).type(dtype)
            self.critic = MLPC(input_size_state=num_feature,
                               input_size_action=num_action,
                               output_size=1,
                               hidden_size=(400, 300),
                               n_layers=2).type(dtype)
            self.target_critic = MLPC(input_size_state=num_feature,
                                      input_size_action=num_action,
                                      output_size=1,
                                      hidden_size=(400, 300),
                                      n_layers=2).type(dtype)
        elif net_type == 1:
            self.actor = MLPA(input_size=num_feature + 1,
                              output_size=num_action,
                              hidden_size=(400, 300),
                              n_layers=2,
                              tanh_flag=1).type(dtype)
            self.target_actor = MLPA(input_size=num_feature + 1,
                                     output_size=num_action,
                                     hidden_size=(400, 300),
                                     n_layers=2,
                                     tanh_flag=1).type(dtype)
            self.critic = MLPC(input_size_state=num_feature + 1,
                               input_size_action=num_action,
                               output_size=1,
                               hidden_size=(400, 300),
                               n_layers=2).type(dtype)
            self.target_critic = MLPC(input_size_state=num_feature + 1,
                                      input_size_action=num_action,
                                      output_size=1,
                                      hidden_size=(400, 300),
                                      n_layers=2).type(dtype)
        elif net_type == 2:
            self.actor = PMLPA(input_size=num_feature,
                               output_size=num_action,
                               hidden_size=(400, 300),
                               dtype=dtype,
                               n_layers=2,
                               tanh_flag=1).type(dtype)
            self.target_actor = PMLPA(input_size=num_feature,
                                      output_size=num_action,
                                      hidden_size=(400, 300),
                                      dtype=dtype,
                                      n_layers=2,
                                      tanh_flag=1).type(dtype)
            self.critic = PMLPC(input_size_state=num_feature,
                                input_size_action=num_action,
                                output_size=1,
                                hidden_size=(400, 300),
                                dtype=dtype,
                                n_layers=2).type(dtype)
            self.target_critic = PMLPC(input_size_state=num_feature,
                                       input_size_action=num_action,
                                       output_size=1,
                                       hidden_size=(400, 300),
                                       dtype=dtype,
                                       n_layers=2).type(dtype)

        # Construct the optimizers for actor and critic
        self.actor_optimizer = actor_optimizer_spec.constructor(
            self.actor.parameters(), **actor_optimizer_spec.kwargs)
        self.critic_optimizer = critic_optimizer_spec.constructor(
            self.critic.parameters(), **critic_optimizer_spec.kwargs)
        # Construct the replay memory
        self.replay_memory = ReplayMemory(replay_memory_size)

    def copy_weights_for_finetune(self, weight_files):
        # hard coded for finetuning ...

        # copy actor
        for lin_layer, weight_file in zip(self.actor.control_hidden_list[0],
                                          weight_files):
            agent = torch.load(weight_file)
            lin_layer.load_state_dict(agent.actor.l1.state_dict())

        for lin_layer, weight_file in zip(self.actor.control_hidden_list[1],
                                          weight_files):
            agent = torch.load(weight_file)
            lin_layer.load_state_dict(agent.actor.l2.state_dict())

        for lin_layer, weight_file in zip(self.actor.control_h2o_list,
                                          weight_files):
            agent = torch.load(weight_file)
            lin_layer.load_state_dict(agent.actor.h2o.state_dict())

        # copy critic
        for lin_layer, weight_file in zip(self.critic.control_hidden_list[0],
                                          weight_files):
            agent = torch.load(weight_file)
            lin_layer.load_state_dict(agent.critic.l1.state_dict())

        for lin_layer, weight_file in zip(self.critic.control_hidden_list[1],
                                          weight_files):
            agent = torch.load(weight_file)
            lin_layer.load_state_dict(agent.critic.l2.state_dict())

        for lin_layer, weight_file in zip(self.critic.control_h2o_list,
                                          weight_files):
            agent = torch.load(weight_file)
            lin_layer.load_state_dict(agent.critic.h2o.state_dict())

    def select_action(self, state, phase, net_type):
        state = torch.from_numpy(state).type(dtype).unsqueeze(0)
        phase = torch.from_numpy(np.array([phase])).type(dtype).unsqueeze(0)
        if net_type == 0:
            action = self.actor(Variable(state, volatile=True)).data.cpu()
        elif net_type == 1:
            action = self.actor(
                Variable(torch.cat((state, phase), 1),
                         volatile=True)).data.cpu()
        elif net_type == 2:
            action = self.actor(Variable(state, volatile=True),
                                Variable(phase, volatile=True)).data.cpu()

        return action

    def update(self, net_type, gamma=1.0):
        if len(self.replay_memory) < self.batch_size:
            return
        state_batch, action_batch, reward_batch, next_state_batch, phase_batch, next_phase_batch, done_mask = \
            self.replay_memory.sample(self.batch_size)
        state_batch = Variable(torch.from_numpy(state_batch).type(dtype))
        action_batch = Variable(torch.from_numpy(action_batch).type(dtype))
        reward_batch = Variable(torch.from_numpy(reward_batch).type(dtype))
        next_state_batch = Variable(
            torch.from_numpy(next_state_batch).type(dtype))
        phase_batch = Variable(
            torch.from_numpy(phase_batch).type(dtype)).unsqueeze(1)
        next_phase_batch = Variable(
            torch.from_numpy(next_phase_batch).type(dtype)).unsqueeze(1)
        not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)

        ### Critic ###
        self.critic_optimizer.zero_grad()
        if net_type == 0 or net_type == 1:
            if net_type == 0:
                # Compute current Q value, critic takes state and action chosen
                current_Q_values = self.critic(
                    torch.cat((state_batch, action_batch), 1))
                # Compute next Q value based on which action target actor would choose
                # Detach from the current graph since we don't want gradients for next Q to propagate
                target_actions = self.target_actor(next_state_batch)
                next_max_q = self.target_critic(
                    torch.cat((next_state_batch, target_actions),
                              1)).detach().max(1)[0]
            elif net_type == 1:
                # Compute current Q value, critic takes state and action chosen
                current_Q_values = self.critic(
                    torch.cat((state_batch, phase_batch, action_batch), 1))
                # Compute next Q value based on which action target actor would choose
                # Detach from the current graph since we don't want gradients for next Q to propagate
                target_actions = self.target_actor(
                    torch.cat((next_state_batch, next_phase_batch), 1))
                next_max_q = self.target_critic(
                    torch.cat(
                        (next_state_batch, next_phase_batch, target_actions),
                        1)).detach().max(1)[0]

            next_Q_values = not_done_mask * next_max_q
            # Compute the target of the current Q values
            target_Q_values = reward_batch + (gamma * next_Q_values)
            # Compute Bellman error (using MSE loss)
            critic_loss = F.mse_loss(current_Q_values, target_Q_values)
            # Optimize the critic
            critic_loss.backward()
            self.critic_optimizer.step()

        elif net_type == 2:
            current_Q_values = self.critic(
                torch.cat((state_batch, action_batch), 1), phase_batch)
            target_actions = self.target_actor(next_state_batch,
                                               next_phase_batch)
            next_max_q = self.target_critic(
                torch.cat((next_state_batch, target_actions), 1),
                next_phase_batch).detach().max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            target_Q_values = reward_batch + (gamma * next_Q_values)
            critic_loss = F.mse_loss(current_Q_values, target_Q_values)
            critic_loss.backward()
            # Optimize the critic
            self.critic_optimizer.step()

        ### Actor ###
        self.actor_optimizer.zero_grad()
        if net_type == 0 or net_type == 1:
            if net_type == 0:
                actor_loss = -self.critic(
                    torch.cat(
                        (state_batch, self.actor(state_batch)), 1)).mean()
            elif net_type == 1:
                actor_loss = -self.critic(
                    torch.cat(
                        (state_batch, phase_batch,
                         self.actor(torch.cat(
                             (state_batch, phase_batch), 1))), 1)).mean()

            # Optimize the actor
            actor_loss.backward()
            self.actor_optimizer.step()

        elif net_type == 2:
            actor_loss = -self.critic(
                torch.cat((state_batch, self.actor(state_batch, phase_batch)),
                          1), phase_batch).mean()
            actor_loss.backward()

            # Optimize the actor
            self.actor_optimizer.step()

        # Update the target networks
        self.update_target(self.target_critic, self.critic)
        self.update_target(self.target_actor, self.actor)

    def update_target(self, target_model, model):
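        # Soft (Polyak) update: target <- tau * model + (1 - tau) * target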
        for target_param, param in zip(target_model.parameters(),
                                       model.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)
Example #7
def train_model(env,
                conv_layers,
                learning_rate=5e-4,
                total_timesteps=100000,
                buffer_size=50000,
                exploration_fraction=0.1,
                exploration_final_eps=0.02,
                train_freq=1,
                batch_size=32,
                print_freq=1,
                checkpoint_freq=100000,
                checkpoint_path=None,
                learning_starts=1000,
                gamma=1.0,
                target_network_update_freq=500,
                double_dqn=False,
                **network_kwargs) -> tf.keras.Model:
    """Train a DQN model.

    Parameters
    -------
    env: gym.Env
        openai gym
    conv_layers: list
        a list of triples that defines the conv network
    learning_rate: float
        learning rate for adam optimizer
    total_timesteps: int
        total number of environment steps to run
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every train_freq steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to store a checkpoint during training
    checkpoint_path: str
        the fs path for storing the checkpoints
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    double_dqn: bool
        specifies if double q-learning is used during training
    Returns
    -------
    q_network: the trained Q-network, a tf.keras.Model
    """
    q_func = build_dueling_q_func(conv_layers, **network_kwargs)

    dqn = DeepQ(model_builder=q_func,
                observation_shape=env.observation_space.shape,
                num_actions=env.action_space.n,
                learning_rate=learning_rate,
                gamma=gamma,
                double_dqn=double_dqn)

    manager = None
    if checkpoint_path is not None:
        load_path = osp.expanduser(checkpoint_path)
        ckpt = tf.train.Checkpoint(model=dqn.q_network)
        manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=5)
        ckpt.restore(manager.latest_checkpoint)
        print("Restoring from {}".format(manager.latest_checkpoint))

    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    train_log_dir = 'logs/gradient_tape/' + current_time + '/train'
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)

    # Create the replay buffer
    replay_buffer = ReplayMemory(buffer_size)
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(total_timesteps=int(exploration_fraction *
                                                     total_timesteps),
                                 initial_prob=1.0,
                                 final_prob=exploration_final_eps)
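    # (Assumption: LinearSchedule anneals the exploration probability linearly
    # from initial_prob to final_prob over the given number of timesteps.)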

    dqn.update_target()

    episode_rewards = [0.0]
    obs = env.reset()

    obs = np.expand_dims(np.array(obs), axis=0)

    for t in range(total_timesteps):
        update_eps = exploration.step_to(t)

        action, _, _, _ = dqn.step(tf.constant(obs), update_eps=update_eps)
        action = action[0].numpy()

        new_obs, reward, done, _ = env.step(action)
        # Store transition in the replay buffer.
        new_obs = np.expand_dims(np.array(new_obs), axis=0)
        replay_buffer.add(obs[0], action, reward, new_obs[0], float(done))
        obs = new_obs

        episode_rewards[-1] += reward
        if done:
            obs = env.reset()
            obs = np.expand_dims(np.array(obs), axis=0)
            episode_rewards.append(0.0)

        if t > learning_starts and t % train_freq == 0:
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)
            weights, _ = tf.ones_like(rewards), None
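            # Uniform importance weights; a prioritized replay buffer would
            # supply per-sample weights here instead.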

            td_loss = dqn.train(obses_t, actions, rewards, obses_tp1, dones,
                                weights)

        if t > learning_starts and t % target_network_update_freq == 0:
            # Update target network every target_network_update_freq steps
            dqn.update_target()

        reward_100_mean = np.round(np.mean(episode_rewards[-101:-1]), 1)
        number_episodes = len(episode_rewards) - 1
        if done and print_freq is not None and number_episodes % print_freq == 0:
            format_str = "Steps: {}, Episodes: {}, 100 ep reward average: {}, Reward: {}, Epsilon-greedy %explore: {}"
            print(
                format_str.format(t, number_episodes, reward_100_mean,
                                  episode_rewards[-2],
                                  int(100 * exploration.value(t))))

            with train_summary_writer.as_default():
                tf.summary.scalar('loss',
                                  dqn.train_loss_metrics.result(),
                                  step=t)
                tf.summary.scalar('reward', episode_rewards[-2], step=t)

        if checkpoint_path is not None and t % checkpoint_freq == 0:
            manager.save()

        # Every training step, reset the loss metric
        dqn.train_loss_metrics.reset_states()

    return dqn.q_network
Example #8
class DDPG():
    """
    The Deep Deterministic Policy Gradient (DDPG) Agent
    Parameters
    ----------
        actor_optimizer_spec: OptimizerSpec
            Specifying the constructor and kwargs, as well as learning rate and other
            parameters for the optimizer
        critic_optimizer_spec: OptimizerSpec
        num_feature: int
            The number of features of the environmental state
        num_action: int
            The number of available actions that agent can choose from
        replay_memory_size: int
            How many memories to store in the replay memory.
        batch_size: int
            How many transitions to sample each time experience is replayed.
        tau: float
            The update rate that target networks slowly track the learned networks.
    """
    def __init__(self,
                 actor_optimizer_spec,
                 critic_optimizer_spec,
                 num_feature,
                 num_action,
                 replay_memory_size=1000000,
                 batch_size=64,
                 tau=0.001):
        ###############
        # BUILD MODEL #
        ###############
        self.num_feature = num_feature
        self.num_action = num_action
        self.batch_size = batch_size
        self.tau = tau
        # Construct actor and critic
        self.actor = Actor(num_feature, num_action).type(dtype)
        self.target_actor = Actor(num_feature, num_action).type(dtype)
        self.critic = Critic(num_feature, num_action).type(dtype)
        self.target_critic = Critic(num_feature, num_action).type(dtype)
        # Construct the optimizers for actor and critic
        self.actor_optimizer = actor_optimizer_spec.constructor(
            self.actor.parameters(), **actor_optimizer_spec.kwargs)
        self.critic_optimizer = critic_optimizer_spec.constructor(
            self.critic.parameters(), **critic_optimizer_spec.kwargs)
        # Construct the replay memory
        self.replay_memory = ReplayMemory(replay_memory_size)

    def select_action(self, state):
        state = torch.from_numpy(state).type(dtype).unsqueeze(0)
        action = self.actor(Variable(state, volatile=True)).data.cpu()[0, 0]
        return action

    def update(self, gamma=1.0):
        if len(self.replay_memory) < self.batch_size:
            return
        state_batch, action_batch, reward_batch, next_state_batch, done_mask = \
            self.replay_memory.sample(self.batch_size)
        state_batch = Variable(torch.from_numpy(state_batch).type(dtype))
        action_batch = Variable(
            torch.from_numpy(action_batch).type(dtype)).unsqueeze(1)
        reward_batch = Variable(torch.from_numpy(reward_batch).type(dtype))
        next_state_batch = Variable(
            torch.from_numpy(next_state_batch).type(dtype))
        not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)

        ### Critic ###
        # Compute current Q value, critic takes state and action chosen
        current_Q_values = self.critic(state_batch, action_batch)
        # Compute next Q value based on which action target actor would choose
        # Detach from the current graph since we don't want gradients for next Q to propagate
        target_Q_values = get_target_value_critic(self.target_critic,
                                                  self.target_actor,
                                                  next_state_batch)
        target_Q_values = torch.squeeze(target_Q_values)

        target_Q_values.data.mul_(gamma)
        # if done, use the reward as the target
        target_Q_values.data.mul_(not_done_mask.data)
        # target_next_state_value:shape [batch_size]
        target_Q_values.data.add_(reward_batch.data)

        # Compute Bellman error (using Huber loss)
        critic_loss = F.smooth_l1_loss(current_Q_values, target_Q_values)
        # critic_loss = torch.mean(torch.pow(target_Q_values - current_Q_values, 2))
        # Optimize the critic
        self.critic.get_optimizer().zero_grad()
        critic_loss.backward()
        self.critic.get_optimizer().step()

        ### Actor ###
        actor_loss = -self.critic(state_batch, self.actor(state_batch)).mean()
        # Optimize the actor
        self.actor.get_optimizer().zero_grad()
        actor_loss.backward()
        self.actor.get_optimizer().step()

        # Update the target networks
        self.target_actor.moving_average_update(self.actor.state_dict(),
                                                decay=1 - self.tau)
        self.target_critic.moving_average_update(self.critic.state_dict(),
                                                 decay=1 - self.tau)
Example #9
class DQN_Agent():
    '''
    Regular Q-Learning Agent
    One deep network.
    DQN predicts the Q-value of a given action in a given state, i.e. Q(s,a) and Q(s',a') for the loss calculation.
    '''
    def __init__(
        self,
        state_size,
        n_actions,
        args,
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu")):
        self.device = device

        # Exploration / Exploitation params.
        self.steps_done = 0
        self.eps_threshold = 1
        self.eps_start = args.eps_start
        self.eps_end = args.eps_end
        self.eps_decay = args.eps_decay

        # RL params
        self.target_update = args.target_update
        self.discount = args.discount

        # Env params
        self.n_actions = n_actions
        self.state_size = state_size

        # Deep q networks params
        self.layers = args.layers
        self.batch_size = args.batch_size
        self.policy_net = DQN(state_size, n_actions,
                              layers=self.layers).to(self.device).float()
        self.target_net = None
        self.grad_clip = args.grad_clip

        if str(args.optimizer).lower() == 'adam':
            self.optimizer = optim.Adam(self.policy_net.parameters())
        elif str(args.optimizer).lower() == 'rmsprop':
            self.optimizer = optim.RMSprop(self.policy_net.parameters())
        else:
            raise NotImplementedError

        self.memory = ReplayMemory(args.replay_size)

        # Performance buffers.
        self.rewards_list = []

    def add_to_memory(self, state, action, next_state, reward):
        self.rewards_list.append(reward)
        state = torch.from_numpy(state).float()
        action = torch.tensor([action])
        next_state = torch.from_numpy(next_state).float()
        reward = torch.tensor([reward])
        self.memory.push(state, action, next_state, reward)

    def select_action(self, state):
        sample = random.random()
        self.eps_threshold = self.eps_end + (self.eps_start - self.eps_end) * \
                        math.exp(-1. * self.steps_done / self.eps_decay)
        self.steps_done += 1
        if sample > self.eps_threshold:
            with torch.no_grad():
                # t.max(1) will return largest column value of each row.
                # second column on max result is index of where max element was
                # found, so we pick action with the larger expected reward.
                state = torch.from_numpy(state).float().to(
                    self.device)  # Convert to tensor.
                state = state.unsqueeze(0)  # Add batch dimension.
                return self.policy_net(state).max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(self.n_actions)]],
                                device=self.device,
                                dtype=torch.long).item()

    def optimize_model(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        # This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))

        next_states_batch = torch.cat(batch.next_state).view(
            self.batch_size, -1).to(self.device)
        state_batch = torch.cat(batch.state).view(self.batch_size,
                                                  -1).to(self.device)
        action_batch = torch.cat(batch.action).view(self.batch_size,
                                                    -1).to(self.device)
        reward_batch = torch.cat(batch.reward).view(self.batch_size,
                                                    -1).to(self.device)

        # Compute loss
        loss = self._compute_loss(state_batch, action_batch, next_states_batch,
                                  reward_batch)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()

        # clip grad
        if self.grad_clip is not None:
            for param in self.policy_net.parameters():
                param.grad.data.clamp_(-self.grad_clip, self.grad_clip)

        # update Policy net weights
        self.optimizer.step()

        # update Target net weights
        self._update_target()

    def _compute_loss(self, state_batch, action_batch, next_states_batch,
                      reward_batch):
        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        # Compute V(s_{t+1}) for all next states using the same policy net.
        next_state_values = self.policy_net(next_states_batch).max(
            1)[0].detach()
        # Compute the expected Q values
        expected_state_action_values = (next_state_values.unsqueeze(1) *
                                        self.discount) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values)
        return loss

    def _update_target(self):
        if self.target_net is None:
            # There is nothing to update.
            return

        # Update the target network, copying all weights and biases in DQN
        if self.target_update > 1:
            # Hard copy of weights.
            if self.steps_done % self.target_update == 0:
                self.target_net.load_state_dict(self.policy_net.state_dict())
            return
        elif self.target_update < 1 and self.target_update > 0:
            # polyak averaging:
            tau = self.target_update
            for target_param, param in zip(self.target_net.parameters(),
                                           self.policy_net.parameters()):
                target_param.data.copy_(tau * param + (1 - tau) * target_param)
            return
        else:
            raise NotImplementedError

    def save_ckpt(self, ckpt_folder):
        '''
        saves checkpoint of policy net in ckpt_folder
        :param ckpt_folder: path to a folder.
        '''
        ckpt_path = os.path.join(ckpt_folder, 'policy_net_state_dict.pth')
        torch.save(self.policy_net.state_dict(), ckpt_path)
Example #10
class DDPG():
    """
    The Deep Deterministic Policy Gradient (DDPG) Agent
    Parameters
    ----------
        actor_optimizer_spec: OptimizerSpec
            Specifying the constructor and kwargs, as well as learning rate and other
            parameters for the optimizer
        critic_optimizer_spec: OptimizerSpec
        num_feature: int
            The number of features of the environmental state
        num_action: int
            The number of available actions that agent can choose from
        replay_memory_size: int
            How many memories to store in the replay memory.
        batch_size: int
            How many transitions to sample each time experience is replayed.
        tau: float
            The update rate that target networks slowly track the learned networks.
    """
    def __init__(self,
                 actor_optimizer_spec,
                 critic_optimizer_spec,
                 num_feature,
                 num_action,
                 replay_memory_size=1000000,
                 batch_size=64,
                 tau=0.001):
        ###############
        # BUILD MODEL #
        ###############
        self.num_feature = num_feature
        self.num_action = num_action
        self.batch_size = batch_size
        self.tau = tau
        # Construct actor and critic
        self.actor = Actor(num_feature, num_action).type(dtype)
        self.target_actor = Actor(num_feature, num_action).type(dtype)
        self.critic = Critic(num_feature, num_action).type(dtype)
        self.target_critic = Critic(num_feature, num_action).type(dtype)
        # Construct the optimizers for actor and critic
        self.actor_optimizer = actor_optimizer_spec.constructor(
            self.actor.parameters(), **actor_optimizer_spec.kwargs)
        self.critic_optimizer = critic_optimizer_spec.constructor(
            self.critic.parameters(), **critic_optimizer_spec.kwargs)
        # Construct the replay memory
        self.replay_memory = ReplayMemory(replay_memory_size)

    def select_action(self, state):
        state = torch.from_numpy(state).type(dtype).unsqueeze(0)
        action = self.actor(Variable(state,
                                     volatile=True)).data[0].cpu().numpy()
        #print(action)
        return action

    def update(self, gamma=1.0):
        if len(self.replay_memory) < self.batch_size:
            return
        state_batch, action_batch, reward_batch, next_state_batch, done_mask = \
            self.replay_memory.sample(self.batch_size)
        state_batch = Variable(torch.from_numpy(state_batch).type(dtype))
        action_batch = Variable(torch.from_numpy(action_batch).type(dtype))
        reward_batch = Variable(torch.from_numpy(reward_batch).type(dtype))
        next_state_batch = Variable(
            torch.from_numpy(next_state_batch).type(dtype))
        not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)

        ### Critic ###
        # Compute current Q value, critic takes state and action chosen
        #print(state_batch.data.size(),action_batch.data.size())
        current_Q_values = self.critic(state_batch, action_batch)
        # Compute next Q value based on which action target actor would choose
        # Detach from the current graph since we don't want gradients for next Q to propagate
        target_actions = self.target_actor(next_state_batch)
        next_max_q = self.target_critic(next_state_batch,
                                        target_actions).detach().max(1)[0]
        next_Q_values = not_done_mask * next_max_q
        # Compute the target of the current Q values
        target_Q_values = reward_batch + (gamma * next_Q_values)
        # Compute Bellman error (using Huber loss)
        critic_loss = F.smooth_l1_loss(current_Q_values, target_Q_values)
        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        ### Actor ###
        actor_loss = -self.critic(state_batch, self.actor(state_batch)).mean()
        # Optimize the actor
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update the target networks
        self.update_target(self.target_critic, self.critic)
        self.update_target(self.target_actor, self.actor)

    def update_target(self, target_model, model):
        for target_param, param in zip(target_model.parameters(),
                                       model.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)