Example #1: Agent_DQN, a PyTorch DQN agent with optional prioritized replay, double DQN, dueling heads, noisy linear layers, and n-step targets.
class Agent_DQN:
    def __init__(self, args, env):
        self.args = args
        self.env = env
        self.input_channels = 3 if 'SpaceInvaders' in args.env_id else 4
        self.num_actions = self.env.action_space.n

        # if testing, simply load the model we have trained
        if args.test_dqn:
            self.load(args.model)
            self.online_net.eval()
            self.target_net.eval()
            return
        # DQN variants setting
        self.prioritized = args.prioritized
        self.double = args.double
        self.n_steps = args.n_steps
        self.noise_linear = args.noise_linear
        if self.prioritized:
            self.memory = PrioritizedReplayBuffer(10000, alpha=0.6)
            self.beta_schedule = LinearSchedule(args.num_timesteps,
                                                initial_p=0.4,
                                                final_p=1.0)

            # element-wise loss (no reduction) so per-sample losses can be
            # weighted and reused as priorities in update()
            self.criterion = nn.MSELoss(reduce=False)
        else:
            self.memory = ReplayBuffer(10000)
            self.criterion = nn.MSELoss()

        if args.atari:
            DQN = DQN_Atari
            input_feature = self.input_channels
        else:
            DQN = DQN_Simple
            input_feature = env.observation_space.shape[0]

        # build target, online network
        self.target_net = DQN(input_feature,
                              self.num_actions,
                              dueling=args.dueling,
                              noise_linear=args.noise_linear)
        self.target_net = self.target_net.cuda() if use_cuda else self.target_net
        self.online_net = DQN(input_feature,
                              self.num_actions,
                              dueling=args.dueling,
                              noise_linear=args.noise_linear)
        self.online_net = self.online_net.cuda() if use_cuda else self.online_net

        # discounted reward
        self.GAMMA = 0.99

        # exploration setting
        self.exploration = LinearSchedule(schedule_timesteps=int(
            0.1 * args.num_timesteps),
                                          initial_p=1.0,
                                          final_p=0.05)

        # training settings
        self.train_freq = 4
        self.learning_start = 10000
        self.batch_size = args.batch_size
        self.num_timesteps = args.num_timesteps
        self.display_freq = args.display_freq
        self.save_freq = args.save_freq
        self.target_update_freq = args.target_update_freq
        self.optimizer = optim.RMSprop(self.online_net.parameters(), lr=1e-4)
        # global status
        self.episodes_done = 0
        self.steps = 0

    def make_action(self, observation, test=True):
        return self.act(observation, test)

    def save(self, save_path):
        print('save model to', save_path)
        torch.save(self.online_net, save_path + '_online')
        torch.save(self.target_net, save_path + '_target')

    def load(self, load_path):
        if use_cuda:
            self.online_net = torch.load(load_path + '_online')
            self.target_net = torch.load(load_path + '_target')
        else:
            self.online_net = torch.load(
                load_path + '_online',
                map_location=lambda storage, loc: storage)
            self.target_net = torch.load(
                load_path + '_target',
                map_location=lambda storage, loc: storage)

    def act(self, state, test=False):
        sample = random.random()
        if test:
            eps_threshold = 0.01
            state = torch.from_numpy(state).permute(2, 0, 1).unsqueeze(0)
            state = state.cuda() if use_cuda else state
        else:
            eps_threshold = self.exploration.value(self.steps)

        if sample > eps_threshold:
            action = self.online_net(
                Variable(state,
                         volatile=True).type(FloatTensor)).data.max(1)[1].view(
                             1, 1)
        else:
            action = LongTensor([[random.randrange(self.num_actions)]])
        return action if not test else action[0, 0]

    def reset_noise(self):
        assert self.noise_linear
        self.online_net.reset_noise()
        self.target_net.reset_noise()

    def update(self):
        if self.prioritized:
            batch, weight, batch_idxes = self.memory.sample(
                self.batch_size, beta=self.beta_schedule.value(self.steps))
            weight_batch = Variable(Tensor(weight)).squeeze()
        else:
            batch = self.memory.sample(self.batch_size)
        # Compute a mask of non-final states and concatenate the batch elements
        non_final_mask = ByteTensor(
            tuple(map(lambda s: s is not None, batch.next_state)))

        # We don't want to backprop through the expected action values and volatile
        # will save us on temporarily changing the model parameters'
        # requires_grad to False!
        non_final_next_states = Variable(torch.cat(
            [s for s in batch.next_state if s is not None]),
                                         volatile=True)
        state_batch = Variable(torch.cat(batch.state))
        action_batch = Variable(torch.cat(batch.action))
        reward_batch = Variable(torch.cat(batch.reward))

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken
        state_action_values = self.online_net(state_batch).gather(
            1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        next_state_values = Variable(torch.zeros(self.batch_size).type(Tensor))
        q_next = self.target_net(non_final_next_states)
        if self.double:
            _, best_actions = self.online_net(non_final_next_states).max(1)
            next_state_values[non_final_mask] = q_next.gather(
                1, best_actions.unsqueeze(1)).squeeze(1)
        else:
            next_state_values[non_final_mask] = q_next.max(1)[0]

        # Now, we don't want to mess up the loss with a volatile flag, so let's
        # clear it. After this, we'll just end up with a Variable that has
        # requires_grad=False
        next_state_values.volatile = False
        # Compute the expected Q values
        expected_state_action_values = (
            next_state_values * (self.GAMMA**(self.n_steps))) + reward_batch
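        # reward_batch holds either the single-step reward pushed by train() or the
        # n-step discounted return pushed by nsteps_train(), which is why the
        # bootstrap term above is scaled by GAMMA**n_steps.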

        # Compute loss
        if self.prioritized:
            loss = self.criterion(state_action_values,
                                  expected_state_action_values)
            loss = torch.mul(loss, weight_batch)
            new_priorities = np.abs(loss.cpu().data.numpy()) + 1e-6
            self.memory.update_priorities(batch_idxes, new_priorities)
            loss = loss.mean()
        else:
            loss = self.criterion(state_action_values,
                                  expected_state_action_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.data[0]

    def process_state(self, state):
        state = np.array(state)
        if self.args.atari:
            # map shape: (84,84,4) --> (1,4,84,84)
            state = torch.from_numpy(state).permute(2, 0, 1).unsqueeze(0)
        else:
            state = torch.Tensor(state).unsqueeze(0)
        return state.cuda() if use_cuda else state

    def train(self):
        total_reward = 0
        loss = 0
        # set training mode
        self.online_net.train()
        while (True):
            if self.noise_linear:
                self.reset_noise()

            state = self.process_state(self.env.reset())
            done = False
            episode_duration = 0
            while (not done):
                # select and perform action
                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action[0, 0])
                total_reward += reward
                reward = Tensor([reward])

                # process new state
                next_state = self.process_state(next_state)
                if done:
                    next_state = None

                # store the transition in memory
                self.memory.push(state, action, next_state, reward)

                # move to the next state
                state = next_state

                # Perform one step of the optimization (on the online network)
                if self.steps > self.learning_start and self.steps % self.train_freq == 0:
                    loss = self.update()
                    if self.noise_linear:
                        self.reset_noise()

                # update target network
                if self.steps > self.learning_start and self.steps % self.target_update_freq == 0:
                    self.target_net.load_state_dict(
                        self.online_net.state_dict())

                if self.steps % self.save_freq == 0:
                    self.save('dqn.cpt')

                self.steps += 1
                episode_duration += 1

            if self.episodes_done % self.display_freq == 0:
                print(
                    'Episode: %d | Steps: %d/%d | Exploration: %f | Avg reward: %f | loss: %f | Episode Duration: %d'
                    % (self.episodes_done, self.steps, self.num_timesteps,
                       self.exploration.value(self.steps), total_reward /
                       self.display_freq, loss, episode_duration))
                writer.add_scalar('reward', total_reward / self.display_freq,
                                  self.steps)
                total_reward = 0

            self.episodes_done += 1
            if self.steps > self.num_timesteps:
                break
        self.save('dqn_final.model')

    def nsteps_train(self):
        '''
        Training procedure for multi-step learning
        '''
        total_reward = 0
        loss = 0
        # set training mode
        self.online_net.train()
        while (True):
            if self.noise_linear:
                self.reset_noise()
            state_buffer = deque()  # store states for future use
            action_buffer = deque()  # store actions for future use
            reward_buffer = deque()  # store rewards for future use
            nstep_reward = 0  # calculate n-step discounted reward

            state = self.process_state(self.env.reset())
            state_buffer.append(state)

            done = False
            episode_duration = 0

            # run n-1 steps
            for _ in range(1, self.n_steps):
                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action[0, 0])
                next_state = self.process_state(next_state)
                if done:
                    next_state = None
                state_buffer.append(next_state)
                action_buffer.append(action)
                reward_buffer.append(reward)

                state = next_state
                episode_duration += 1

            while (not done):
                # select and perform action
                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action[0, 0])
                total_reward += reward

                # process new state
                next_state = self.process_state(next_state)
                if done:
                    next_state = None

                # save new state, action, reward
                state_buffer.append(next_state)
                action_buffer.append(action)
                reward_buffer.append(reward)

                # n-step discounted return for the oldest buffered state:
                # R = r_t + GAMMA * r_{t+1} + ... + GAMMA^(n-1) * r_{t+n-1}
                nstep_reward = sum((self.GAMMA**i) * r
                                   for i, r in enumerate(reward_buffer))

                # store the transition in memory
                self.memory.push(state_buffer.popleft(),
                                 action_buffer.popleft(), next_state,
                                 Tensor([nstep_reward]))

                # drop the oldest reward before the next step
                reward_buffer.popleft()

                # move to the next state
                state = next_state

                # Perform one step of the optimization (on the online network)
                if self.steps > self.learning_start and self.steps % self.train_freq == 0:
                    loss = self.update()
                    if self.noise_linear:
                        self.reset_noise()

                # update target network
                if self.steps > self.learning_start and self.steps % self.target_update_freq == 0:
                    self.target_net.load_state_dict(
                        self.online_net.state_dict())

                if self.steps % self.save_freq == 0:
                    self.save('dqn.cpt')

                self.steps += 1
                episode_duration += 1

            if self.episodes_done % self.display_freq == 0:
                print(
                    'Episode: %d | Steps: %d/%d | Exploration: %f | Avg reward: %f | loss: %f | Episode Duration: %d'
                    % (self.episodes_done, self.steps, self.num_timesteps,
                       self.exploration.value(self.steps), total_reward /
                       self.display_freq, loss, episode_duration))
                writer.add_scalar('reward', total_reward / self.display_freq,
                                  self.steps)
                total_reward = 0

            self.episodes_done += 1
            if self.steps > self.num_timesteps:
                break
        self.save('dqn_final.model')
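
A minimal, hypothetical driver for Agent_DQN, shown only as a sketch: the argument names below are the fields read in __init__ above, but the default values, the environment wrapper, and the entry point itself are assumptions, not part of the original code.

import argparse
import gym

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_id', default='SpaceInvadersNoFrameskip-v4')
    parser.add_argument('--test_dqn', action='store_true')
    parser.add_argument('--model', default='dqn_final.model')
    parser.add_argument('--atari', action='store_true', default=True)
    parser.add_argument('--prioritized', action='store_true')
    parser.add_argument('--double', action='store_true')
    parser.add_argument('--dueling', action='store_true')
    parser.add_argument('--noise_linear', action='store_true')
    parser.add_argument('--n_steps', type=int, default=1)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--num_timesteps', type=int, default=3000000)
    parser.add_argument('--display_freq', type=int, default=10)
    parser.add_argument('--save_freq', type=int, default=200000)
    parser.add_argument('--target_update_freq', type=int, default=1000)
    args = parser.parse_args()

    # A preprocessed Atari environment producing stacked HxWxC frames is assumed.
    env = gym.make(args.env_id)
    agent = Agent_DQN(args, env)
    if args.n_steps > 1:
        agent.nsteps_train()   # multi-step variant
    else:
        agent.train()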
Example #2: DynaQAgent, a multiprocessing DQN worker that shares sampled experience batches with other processes through queues.
class DynaQAgent(mp.Process):
    def __init__(self,
                 id,
                 env,
                 state_size,
                 action_size,
                 n_episodes,
                 lr,
                 gamma,
                 global_network,
                 target_network,
                 q,
                 max_t=1000,
                 eps_start=1.0,
                 eps_end=0.01,
                 eps_decay=0.995):
        super(DynaQAgent, self).__init__()
        self.id = id
        self.env = env
        self.state_size = state_size
        self.action_size = action_size
        self.n_episodes = n_episodes
        self.gamma = gamma

        self.q = q

        self.local_memory = ReplayBuffer(self.action_size, BUFFER_SIZE,
                                         BATCH_SIZE)

        self.t_step = 0
        self.max_t = max_t
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay

        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

        self.global_network = global_network
        self.target_network = target_network

        self.optimizer = optim.SGD(self.global_network.parameters(),
                                   lr=lr,
                                   momentum=.5)

        self.scores_window = deque(maxlen=100)  # last 100 scores

    def act(self, state, eps=0.):
        if random.random() > eps:
            # Turn the state into a tensor
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)

            with torch.no_grad():
                action_values = self.global_network(
                    state)  # greedy choice from the shared (global) network

            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.local_memory.add(state, action, reward, next_state, done)

        # Increment local timer
        self.t_step += 1

        if self.t_step > BATCH_SIZE:
            experiences = self.local_memory.sample(BATCH_SIZE)
            self.learn(experiences)

            # TODO: Better way to do this??
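            # Share one freshly sampled batch through the queues (tensors moved to
            # shared memory), but only while queue 0 is empty and the rolling
            # 100-episode average score is still below 180.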
            if self.q[0].empty() and np.mean(self.scores_window) < 180:
                experiences = self.local_memory.sample(BATCH_SIZE)
                for i in range(5):
                    self.q[i].put(experiences[i].detach().share_memory_())

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.target_network(next_states).detach().max(
            1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from the shared (global) model
        Q_expected = self.global_network(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.global_network, self.target_network, TAU)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def get_experience_as_tensor(self, e):
        states = torch.from_numpy(np.vstack([e.state])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state
                                                  ])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done]).astype(
            np.uint8)).float().to(device)

        return (states, actions, rewards, next_states, dones)

    def get_action_values(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        with torch.no_grad():
            action_values = self.target_network(state)

        return action_values.cpu().data.numpy()[0]

    def get_delta(self, state, action, next_state, reward):
        priority = reward + self.gamma * np.max(
            self.get_action_values(next_state)) - self.get_action_values(
                state)[action]
        return priority

    def run(self):
        scores = []

        eps = self.eps_start  # initialize epsilon
        start_time = time.time()
        for i_episode in range(1, self.n_episodes + 1):
            state = self.env.reset()
            score = 0

            for t in range(self.max_t):
                action = self.act(state, eps)
                # if do_render:
                #     self.env.render()
                next_state, reward, done, _ = self.env.step(action)
                self.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break
            self.scores_window.append(score)  # save most recent score
            scores.append(score)  # save most recent score
            eps = max(self.eps_end, self.eps_decay * eps)  # decrease epsilon
            elapsed_time = time.time() - start_time
            if self.id == 0:
                print(
                    '\rThread: {}, Episode {}\tAverage Score: {:.2f}, Runtime: '
                    .format(self.id, i_episode, np.mean(self.scores_window)) +
                    time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
            elif i_episode % 100 == 0:
                print(
                    '\rThread: {}, Episode {}\tAverage Score: {:.2f}, Runtime: '
                    .format(self.id, i_episode, np.mean(self.scores_window)) +
                    time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
            if np.mean(self.scores_window) >= 200.0:
                print(
                    '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                    .format(i_episode - 100, np.mean(self.scores_window)))
                break
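
A minimal launch sketch for the DynaQAgent workers above. The QNetwork class, the device object, and the LunarLander-v2 task are assumptions (the 200-point solve threshold in run() matches that task); the five queues mirror the q[0]..q[4] slots written in step().

import gym
import torch.multiprocessing as mp

if __name__ == '__main__':
    env_id = 'LunarLander-v2'
    probe_env = gym.make(env_id)
    state_size = probe_env.observation_space.shape[0]
    action_size = probe_env.action_space.n

    # QNetwork is an assumed model class shared by all worker processes.
    global_network = QNetwork(state_size, action_size).to(device)
    target_network = QNetwork(state_size, action_size).to(device)
    global_network.share_memory()   # parameters live in shared memory across processes
    target_network.share_memory()

    # one queue per experience field: state, action, reward, next_state, done
    q = [mp.Queue() for _ in range(5)]

    workers = [DynaQAgent(id=i, env=gym.make(env_id),
                          state_size=state_size, action_size=action_size,
                          n_episodes=2000, lr=1e-3, gamma=0.99,
                          global_network=global_network,
                          target_network=target_network, q=q)
               for i in range(4)]   # worker count chosen arbitrarily here
    for w in workers:
        w.start()
    for w in workers:
        w.join()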
Example #3: DQNAgent, a HogWild!-style multiprocessing DQN worker that trains a shared global network against a soft-updated target network.
class DQNAgent(mp.Process):
    def __init__(self,
                 id,
                 env,
                 do_render,
                 state_size,
                 action_size,
                 n_episodes,
                 lr,
                 gamma,
                 update_every,
                 global_network,
                 target_network,
                 max_t=1000,
                 eps_start=1.0,
                 eps_end=0.01,
                 eps_decay=0.995):
        super(DQNAgent, self).__init__()
        self.id = id
        self.env = env
        self.do_render = do_render
        self.state_size = state_size
        self.action_size = action_size
        self.n_episodes = n_episodes
        self.gamma = gamma
        self.update_every = update_every

        self.local_memory = ReplayBuffer(env.action_space.n, BUFFER_SIZE,
                                         BATCH_SIZE)

        self.global_network = global_network
        self.qnetwork_target = target_network

        self.optimizer = optim.SGD(self.global_network.parameters(),
                                   lr=lr,
                                   momentum=.5)

        self.t_step = 0
        self.max_t = max_t
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay

    def act(self, state, eps=0.):
        if random.random() > eps:
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)

            with torch.no_grad():
                action_values = self.global_network(state)

            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.local_memory.add(state, action, reward, next_state, done)

        # Increment local timer
        self.t_step += 1

        # If enough samples are available in memory, get random subset and learn
        # Learn every UPDATE_EVERY time steps.
        if self.t_step % self.update_every == 0:
            if self.t_step > BATCH_SIZE:
                experiences = self.local_memory.sample(BATCH_SIZE)
                self.learn(experiences)

    def compute_loss(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from the shared (global) model
        Q_expected = self.global_network(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        return loss

    def learn(self, experiences):

        loss = self.compute_loss(experiences)

        # Update gradients per HogWild! algorithm
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.global_network, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def run(self):
        scores = []
        scores_window = deque(maxlen=100)  # last 100 scores
        eps = self.eps_start  # initialize epsilon
        start_time = time.time()
        for i_episode in range(1, self.n_episodes + 1):
            state = self.env.reset()
            score = 0
            for t in range(self.max_t):
                action = self.act(state, eps)
                if self.do_render:
                    self.env.render()
                next_state, reward, done, _ = self.env.step(action)
                self.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break
            scores_window.append(score)  # save most recent score
            scores.append(score)  # save most recent score
            eps = max(self.eps_end, self.eps_decay * eps)  # decrease epsilon
            elapsed_time = time.time() - start_time
            if self.id == 0:
                print(
                    '\rThread: {}, Episode {}\tAverage Score: {:.2f}, Runtime: '
                    .format(self.id, i_episode, np.mean(scores_window)) +
                    time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
            elif i_episode % 100 == 0:
                print(
                    '\rThread: {}, Episode {}\tAverage Score: {:.2f}, Runtime: '
                    .format(self.id, i_episode, np.mean(scores_window)) +
                    time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
            if np.mean(scores_window) >= 200.0:
                print(
                    '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                    .format(i_episode - 100, np.mean(scores_window)))
                torch.save(self.global_network.state_dict(), 'checkpoint.pth')
                break
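
A similar hypothetical launch sketch for DQNAgent, again assuming a QNetwork class, a device object, and LunarLander-v2; the extra do_render and update_every parameters of this class are wired through.

import gym
import torch.multiprocessing as mp

if __name__ == '__main__':
    env_id = 'LunarLander-v2'
    probe_env = gym.make(env_id)
    state_size = probe_env.observation_space.shape[0]
    action_size = probe_env.action_space.n

    global_network = QNetwork(state_size, action_size).to(device)   # QNetwork is assumed
    target_network = QNetwork(state_size, action_size).to(device)
    global_network.share_memory()   # HogWild!-style shared parameters
    target_network.share_memory()

    workers = [DQNAgent(id=i, env=gym.make(env_id), do_render=False,
                        state_size=state_size, action_size=action_size,
                        n_episodes=2000, lr=1e-3, gamma=0.99, update_every=4,
                        global_network=global_network,
                        target_network=target_network)
               for i in range(4)]   # worker count chosen arbitrarily here
    for w in workers:
        w.start()
    for w in workers:
        w.join()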