Example #1
def main():
    env_spec = registry[env_name]
    env = gym.make(env_spec["id"])
    ep_max_steps = env_spec["max_episode_steps"]
    agent = DDPG(env.observation_space.shape, env.action_space.shape,
                 env.action_space.low[0], env.action_space.high[0])
    replay_buffer = ReplayBuffer()

    state = env.reset()
    done = False
    ep_timesteps = 0
    ep_reward = 0
    ep_num = 0
    reward_history = []

    for t in range(TOTAL_TIMESTEPS):
        ep_timesteps += 1

        # Select action
        if t < START_TIMESTEP:
            action = env.action_space.sample()
        else:
            action = agent.select_action(np.array(state))

        # Perform action
        next_state, reward, done, _ = env.step(action)
        train_done = done and ep_timesteps < ep_max_steps

        replay_buffer.add(
            TransitionTuple(state, action, next_state, reward,
                            int(train_done)))

        state = next_state
        ep_reward += reward

        if t >= START_TIMESTEP:
            agent.train(replay_buffer, BATCH_SIZE)

        if done:
            reward_history.append(ep_reward)
            print(
                f"[Episode {ep_num+1}, Timestep {t+1}] Total reward: {ep_reward}  Total timesteps: {ep_timesteps}"
            )
            state = env.reset()
            done = False
            ep_timesteps = 0
            ep_reward = 0
            ep_num += 1

        if RENDER:
            env.render()

    # Visualize results
    if OUTPUT_PLOT:
        sns.lineplot(x=np.arange(len(reward_history)) + 1, y=reward_history)
        plt.ylabel("Episode Reward")
        plt.xlabel("Episode Number")
        plt.savefig(OUTPUT_PLOT)
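
The loop above relies on a TransitionTuple and a ReplayBuffer that are defined elsewhere in the project. A minimal sketch of what a uniform-sampling buffer with this interface could look like (the field names and sampling API are assumptions, not the project's actual code):

import random
from collections import deque, namedtuple

import numpy as np

# Hypothetical transition container matching the fields stored above.
TransitionTuple = namedtuple(
    "TransitionTuple", ["state", "action", "next_state", "reward", "done"])


class ReplayBuffer:
    """Fixed-size FIFO buffer with uniform random sampling (sketch)."""

    def __init__(self, max_size=int(1e6)):
        self.storage = deque(maxlen=max_size)

    def add(self, transition):
        self.storage.append(transition)

    def sample(self, batch_size):
        batch = random.sample(self.storage, batch_size)
        # Stack each field into an array so the agent can build tensors from it.
        return tuple(np.array(field) for field in zip(*batch))

    def __len__(self):
        return len(self.storage)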
Example #2
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 random_seed,
                 device,
                 lr_actor,
                 lr_critic,
                 weight_decay_critic,
                 batch_size,
                 buffer_size,
                 gamma,
                 tau,
                 update_every,
                 n_updates,
                 eps_start,
                 eps_end,
                 eps_decay):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.t_step = 0
        self.device = device
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay_critic = weight_decay_critic
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.gamma = gamma
        self.tau = tau
        self.update_every = update_every
        self.n_updates = n_updates
        self.eps = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(self.device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(self.device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay_critic)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, random_seed, self.device)

    def step(self, state, action, reward, next_state, done, agent_number):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.t_step += 1
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory and at interval settings
        if len(self.memory) > self.batch_size:
            if self.t_step % self.update_every == 0:
                for _ in range(self.n_updates):
                    experiences = self.memory.sample()
                    self.learn(experiences, self.gamma, agent_number)

    def act(self, states, add_noise):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(self.device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action
        self.actor_local.train()
        if add_noise:
            actions += self.eps * self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, agent_number):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)

        if agent_number == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)

        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)

        if agent_number == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)

        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        # Update epsilon noise value
        self.eps = max(self.eps_end, self.eps_decay*self.eps)
        # self.eps = self.eps - (1/self.eps_decay)
        # if self.eps < self.eps_end:
        #     self.eps = self.eps_end

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
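
The Agent above constructs an OUNoise exploration process that is not shown. A minimal Ornstein-Uhlenbeck sketch compatible with the (size, seed) constructor and the reset()/sample() calls used above (an assumption, not the project's own implementation):

import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck exploration noise (sketch)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal noise state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) \
            + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state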
Example #3
File: ddpg.py  Project: namidairo777/DRL
def train(sess, env, actor, critic, noise, reward, discrete):
    # set up summary writer
    summary_writer = tf.summary.FileWriter("ddpg_summary")

    sess.run(tf.global_variables_initializer())

    # initialize the actor and critic target networks
    actor.update_target_network()
    critic.update_target_network()

    # initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    # initialize noise
    ou_level = 0.

    for i in range(MAX_EPISODES):
        s = env.reset()
        ep_reward = 0
        ep_ave_max_q = 0

        episode_buffer = np.empty((0, 5), float)

        for j in range(MAX_EP_STEPS):
            if RENDER_ENV:
                env.render()
            a = actor.predict(np.reshape(s, (1, actor.s_dim)))

            # Add exploration noise
            if i < NOISE_MAX_EP:
                ou_level = noise.ornstein_uhlenbeck_level(ou_level)
                a = a + ou_level

            # Set action for discrete and continuous action spaces
            if discrete:
                action = np.argmax(a)
            else:
                action = a[0]

            s2, r, terminal, info = env.step(action)

            # Choose reward type
            ep_reward += r

            episode_buffer = np.append(episode_buffer,
                                       [[s, a, r, terminal, s2]],
                                       axis=0)

            # Adding experience to memory
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))
                ep_ave_max_q += np.max(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            # Set previous state for next step
            s = s2
            if terminal:
                # Reward system for episode
                #
                episode_buffer = reward.discount(episode_buffer)

                # Add episode to replay
                for step in episode_buffer:
                    replay_buffer.add(np.reshape(step[0], (actor.s_dim, )),
                                      np.reshape(step[1], (actor.a_dim, )),
                                      step[2], step[3],
                                      np.reshape(step[4], (actor.s_dim, )))

                # summary = tf.summary()
                # summary.value.add(tag="Perf/Reward", simple_value=float(ep_reward))
                # summary.value.add(tag="Perf/Qmax", simple_value=float(ep_ave_max_q / float(j)))
                # summary_writer.add_summary(summary, i)

                # summary_writer.flush()

                if i != 0:
                    # Average the max Q over the steps taken in this episode
                    print "|Reward: %.2i | Episode: %d | Qmax: %.4f" % (
                        int(ep_reward), i, (ep_ave_max_q / float(j + 1)))
                break
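
The reward.discount call above is project-specific: it rewrites the rewards of the finished episode before its transitions are pushed into the replay buffer. A hedged sketch of a typical discounted-return version (GAMMA and the (s, a, r, terminal, s2) row layout are assumptions):

GAMMA = 0.99  # assumed discount factor


def discount_episode(episode_buffer, gamma=GAMMA):
    """Replace each reward (column 2) with its discounted return (sketch)."""
    running_return = 0.0
    for idx in reversed(range(len(episode_buffer))):
        running_return = episode_buffer[idx][2] + gamma * running_return
        episode_buffer[idx][2] = running_return
    return episode_buffer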
Example #4
class DuelingDQN(BaseAgent):
    def __init__(self, env):
        self.buffer_size = 20000
        self.batch_size = 64
        self.tau = 1
        self.gamma = 0.95
        self.learning_rate = 0.001

        # Exploration Parameters
        self.E_start = 1
        self.E_end = 0.1
        self.E_decay = 0.002
        self.episode = 0

        self.env = env
        self.os = self.env.observation_space
        #         self.acs = self.env.action_space

        self.edim = len(self.os.high)
        self.adim = self.env.action_space.n

        self.buffer = ReplayBuffer(self.buffer_size, self.edim, 1)

        self.local = DuelingDQN_Model(self.edim, self.adim, self.learning_rate)
        self.target = DuelingDQN_Model(self.edim, self.adim,
                                       self.learning_rate)

        self.initial_weights = self.local.model.get_weights()
        self.target.model.set_weights(self.initial_weights)

    def act(self, state, testing):
        state = state.reshape([1, -1])
        actionQs = self.local.model.predict(state)

        action = np.argmax(actionQs)

        epsilon = self.E_end + (self.E_start - self.E_end) * np.exp(
            -self.E_decay * self.episode)
        if (not testing):
            if (np.random.rand() < epsilon):
                action = np.random.choice(self.adim)

#         action = np.array([action])

        return action

    def learn(self, state, action, reward, next_state, done, testing):

        # Skip all learning during testing
        if (testing):
            return

        act_index = action

        self.buffer.add(state, act_index, reward, next_state, done)

        if (done):
            self.episode += 1

            # TODO When upgrading to RDPG this should be per-episode based

            states, actions, rewards, next_states, dones = self.buffer.batch(
                self.batch_size)

            actions = actions.astype(int)
            actions = actions.reshape([-1])
            rewards = rewards.reshape([-1])
            dones = dones.reshape([-1])
            # Bellman equation
            target_Q = rewards + self.gamma * np.amax(
                self.target.model.predict_on_batch(next_states),
                axis=1) * (1 - dones)

            self.local.train([states, actions, target_Q])

            self.soft_update(self.target, self.local)

    def soft_update(self, target, local):
        local_weights = np.array(local.model.get_weights())
        target_weights = np.array(target.model.get_weights())

        new_target_weights = (
            1 - self.tau) * target_weights + self.tau * local_weights

        target.model.set_weights(new_target_weights)

    def reset(self):
        shuffle_weights(self.local.model, self.initial_weights)
        self.target.model.set_weights(self.local.model.get_weights())
        self.episode = 0
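
DuelingDQN_Model is not shown above; the defining idea of the dueling architecture is to split the network into a state-value stream and an advantage stream and recombine them as Q(s, a) = V(s) + A(s, a) - mean_a A(s, a). A minimal Keras sketch under that assumption (layer sizes and the function name are illustrative only):

from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras import layers


def build_dueling_model(state_dim, action_dim, learning_rate):
    """Dueling head: Q(s, a) = V(s) + A(s, a) - mean_a A(s, a) (sketch)."""
    states = layers.Input(shape=(state_dim,))
    x = layers.Dense(64, activation="relu")(states)
    x = layers.Dense(64, activation="relu")(x)
    value = layers.Dense(1)(x)               # state-value stream V(s)
    advantage = layers.Dense(action_dim)(x)  # advantage stream A(s, a)
    q_values = layers.Lambda(
        lambda va: va[0] + va[1] - K.mean(va[1], axis=1, keepdims=True)
    )([value, advantage])
    model = keras.Model(inputs=states, outputs=q_values)
    model.compile(optimizer=keras.optimizers.Adam(learning_rate), loss="mse")
    return model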
Example #5
        s = env.reset()
        s_t = np.hstack((s[0], s[1], s[2]))
        while True:
            #epsilon *= 0.995
            loss = 0.0
            #epsilon -= 1.0/10000.0
            a = actor.model.predict(s_t.reshape(1,s_t.shape[0]))
            noise = Ornstein_Uhlenbeck(a[0])
            #noise = max(epsilon,0) * noise.function(a[0], 0.0, 0.15, 0.3) 
#            a = a[0] + noise
            a = a[0] + noise()[0]
            
            s2, r, done, info = env.step(np.array(a))
            s2_t = np.hstack((s2[0],s2[1],s2[2]))

            buff.add(s_t, a, r, s2_t, done)
            batch = buff.get_batch(batch_size)

            if len(batch) >= batch_size:
                states = np.asarray([e[0] for e in batch])
                actions = np.asarray([e[1] for e in batch])
                reward = np.asarray([e[2] for e in batch])
                new_states = np.asarray([e[3] for e in batch])
                dones = np.asarray([e[4] for e in batch])
                y_t = np.asarray([e[1] for e in batch])  # placeholder; overwritten with targets below
               
                target_q = critic.target_model.predict([new_states, actor.target_model.predict(new_states)])
                
                for k in range(len(batch)):
                    if dones[k]:
                        y_t[k] = reward[k]
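
The snippet above is cut off inside the target-computation loop. In comparable Keras DDPG examples the loop typically completes as below; this is a sketch under assumed names (GAMMA, the critic/actor update calls), not this project's code:

# Sketch of how such a target loop usually completes (GAMMA assumed):
for k in range(len(batch)):
    if dones[k]:
        y_t[k] = reward[k]
    else:
        y_t[k] = reward[k] + GAMMA * target_q[k]
# The critic is then trained on (states, actions) -> y_t, and the actor is
# updated with the critic's action gradients, as in the other examples here.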
Example #6
File: main.py  Project: ezliu/SlimeAI
def challenger_round():
    challengers = []
    leaders = []
    leader_checkpoints = os.listdir(LEADER_DIR)
    # Need to share the same schedule with all challengers, so they all anneal
    # at same rate
    epsilon_schedule = LinearSchedule(EPS_START, EPS_END, TRAIN_FRAMES)
    for i in xrange(NUM_LEADERS):
        challenger = try_gpu(
            DQNAgent(6,
                     epsilon_schedule,
                     OBSERVATION_MODE,
                     lr=LR,
                     max_grad_norm=GRAD_CLIP_NORM))
        if i < len(leader_checkpoints):
            leader = try_gpu(
                DQNAgent(6, LinearSchedule(0.1, 0.1, 500000),
                         OBSERVATION_MODE))
            leader_path = os.path.join(LEADER_DIR, leader_checkpoints[i])
            print "LOADING CHECKPOINT: {}".format(leader_path)
            challenger.load_state_dict(
                torch.load(leader_path,
                           map_location=lambda storage, loc: storage))
            leader.load_state_dict(
                torch.load(leader_path,
                           map_location=lambda storage, loc: storage))
        else:
            leader = RandomAgent(6)
            print "INITIALIZING NEW CHALLENGER AND LEADER"
        challengers.append(challenger)
        leaders.append(leader)

    if CHALLENGER_DIR is not None:
        challengers = []
        # Load in all of the leaders
        for checkpoint in os.listdir(CHALLENGER_DIR):
            path = os.path.join(CHALLENGER_DIR, checkpoint)
            print "LOADING FROM CHALLENGER_DIR: {}".format(path)
            challenger = try_gpu(
                DQNAgent(6,
                         LinearSchedule(0.05, 0.05, 1),
                         CHALLENGER_OBSERVATION_MODE,
                         lr=LR,
                         max_grad_norm=GRAD_CLIP_NORM,
                         name=checkpoint))
            challenger.load_state_dict(
                torch.load(path, map_location=lambda storage, loc: storage))
            challengers.append(challenger)

    challenger = EnsembleDQNAgent(challengers)
    leader = EnsembleDQNAgent(leaders)
    if OPPONENT is not None or HUMAN:
        leader = NoOpAgent()
    replay_buffer = ReplayBuffer(1000000)
    rewards = collections.deque(maxlen=1000)
    frames = 0  # number of training frames seen
    episodes = 0  # number of training episodes that have been played
    with tqdm(total=TRAIN_FRAMES) as progress:
        # Each loop completes a single episode
        while frames < TRAIN_FRAMES:
            states = env.reset()
            challenger.reset()
            leader.reset()
            episode_reward = 0.
            episode_frames = 0
            # Each loop completes a single step, duplicates _evaluate() to
            # update at the appropriate frame #s
            for _ in xrange(MAX_EPISODE_LENGTH):
                frames += 1
                episode_frames += 1
                action1 = challenger.act(states[0])
                action2 = leader.act(states[1])
                next_states, reward, done = env.step(action1, action2)
                episode_reward += reward

                # NOTE: state and next_state are LazyFrames and must be
                # converted to np.arrays
                replay_buffer.add(
                    Experience(states[0], action1._action_index, reward,
                               next_states[0], done))
                states = next_states

                if len(replay_buffer) > 50000 and \
                        frames % 4 == 0:
                    experiences = replay_buffer.sample(32)
                    challenger.update_from_experiences(experiences)

                if frames % 10000 == 0:
                    challenger.sync_target()

                if frames % SAVE_FREQ == 0:
                    # TODO: Don't access internals
                    for agent in challenger._agents:
                        path = os.path.join(LEADER_DIR,
                                            agent.name + "-{}".format(frames))
                        print "SAVING CHECKPOINT TO: {}".format(path)
                        torch.save(agent.state_dict(), path)
                    #path = os.path.join(
                    #        LEADER_DIR, challenger.name + "-{}".format(frames))
                    #torch.save(challenger.state_dict(), path)

                if frames >= TRAIN_FRAMES:
                    break

                if done:
                    break

            if episodes % 300 == 0:
                print "Evaluation: {}".format(
                    evaluate(challenger, leader, EPISODES_EVALUATE_TRAIN))
            print "Episode reward: {}".format(episode_reward)
            episodes += 1
            rewards.append(episode_reward)
            stats = challenger.stats
            stats["Avg Episode Reward"] = float(sum(rewards)) / len(rewards)
            stats["Num Episodes"] = episodes
            stats["Replay Buffer Size"] = len(replay_buffer)
            progress.set_postfix(stats, refresh=False)
            progress.update(episode_frames)
            episode_frames = 0
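
LinearSchedule above is assumed to anneal epsilon from EPS_START to EPS_END over TRAIN_FRAMES. A minimal sketch of such a schedule (the constructor matches the (start, end, steps) calls above; the value() accessor is an assumption):

class LinearSchedule(object):
    """Linearly anneal a value over a fixed number of steps (sketch)."""

    def __init__(self, start, end, schedule_timesteps):
        self.start = float(start)
        self.end = float(end)
        self.schedule_timesteps = schedule_timesteps

    def value(self, t):
        """Annealed value after t steps; clamps at the end value."""
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.start + fraction * (self.end - self.start)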
Example #7
class DDPG:
    def __init__(self, config):
        self.config = config
        self.state_size = config.state_size
        self.action_size = config.action_size

        self.actor_local = Actor(self.state_size, self.action_size,
                                 2).to(device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  2).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=config.LR_ACTOR)

        self.critic_local = Critic(self.state_size, self.action_size,
                                   2).to(device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    2).to(device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=config.LR_CRITIC,
        )

        self.memory = ReplayBuffer(config.random_seed, config.BUFFER_SIZE)
        self.noise = OUNoise(self.action_size, config.random_seed)

        self.t_step = 0

        self.soft_update(self.critic_local, self.critic_target, 1)
        self.soft_update(self.actor_local, self.actor_target, 1)

    def step(self, states, actions, rewards, next_states, dones):

        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % self.config.UPDATE_EVERY

        if len(self.memory) > self.config.BATCH_SIZE and (self.t_step == 0):

            for i in range(self.config.EPOCH):
                experiences = self.memory.sample(self.config.BATCH_SIZE)
                self.learn(experiences)

    def reset(self):
        self.noise.reset()

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        Q_targets_next = self.critic_target(next_states,
                                            self.actor_target(next_states))
        Q_targets = rewards + (self.config.GAMMA * Q_targets_next *
                               (1 - dones))
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        actor_loss = -self.critic_local(states,
                                        self.actor_local(states)).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target,
                         self.config.TAU)
        self.soft_update(self.actor_local, self.actor_target, self.config.TAU)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
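
The DDPG class above is driven externally. A hypothetical episode loop showing the expected call pattern (env, config, and the episode count are assumptions, not part of the project):

import numpy as np

# Hypothetical driver; `env` is assumed to return batched states for all
# parallel agents, and `config` to carry the fields the class reads above.
agent = DDPG(config)
for episode in range(1, 301):
    states = env.reset()
    agent.reset()
    while True:
        actions = agent.act(states)
        next_states, rewards, dones, _ = env.step(actions)
        agent.step(states, actions, rewards, next_states, dones)
        states = next_states
        if np.any(dones):
            break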
Example #8
class Agent(object):
    """DDPG Agent that interacts and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 device,
                 actor_args={},
                 critic_args={}):
        """Initializes the DQN agent.

        Args:
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            device (torch.device): Device to use for calculations
            actor_args (dict): Arguments describing the actor network
            critic_args (dict): Arguments describing the critic network
        """
        self.state_size = state_size
        """Dimension of each state"""

        self.action_size = action_size
        """Dimension of each action"""

        self.device = device
        """Device to use for calculations"""

        self.t_step = 0
        """Timestep between training updates"""

        # Parameters

        # Actor network
        self.actor_local = Actor(state_size, action_size,
                                 **actor_args).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  **actor_args).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic network

        self.critic_local = Critic(state_size, action_size,
                                   **critic_args).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    **critic_args).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process for exploration
        self.noise = OUNoise(action_size, sigma=NOISE_SD)

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, self.device)

    def reset(self):
        """Reset state of agent."""
        self.noise.reset()

    def save_weights(self, path):
        """Save local network weights.

        Args:
            path (string): File to save to"""
        torch.save(
            {
                'actor_local': self.actor_local.state_dict(),
                'actor_target': self.actor_target.state_dict(),
                'critic_local': self.critic_local.state_dict(),
                'critic_target': self.critic_target.state_dict()
            }, path)

    def load_weights(self, path):
        """Load local network weights.

        Args:
            path (string): File to load weights from"""
        checkpoint = torch.load(path)
        self.actor_local.load_state_dict(checkpoint['actor_local'])
        self.actor_target.load_state_dict(checkpoint['actor_target'])
        self.critic_local.load_state_dict(checkpoint['critic_local'])
        self.critic_target.load_state_dict(checkpoint['critic_target'])

    def act(self, state, add_noise=True):
        """Returns action for given state according to the current policy
            
        Args:
            state (np.ndarray): Current state

        Returns:
            action (np.ndarray): Action tuple
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # Temporarily set evaluation mode (no dropout &c) & turn off autograd
        self.actor_local.eval()

        with torch.no_grad():
            action = self.actor_local(state).cpu().detach().numpy()

        # Resume training mode
        self.actor_local.train()

        # Add noise if exploring
        if add_noise:
            action += self.noise.sample()
            # The noise might take us out of range
            action = np.clip(action, -1, 1)

        return action

    def step(self, state, action, reward, next_state, done):
        """Save experience and learn if due.
        Args:
            state (Tensor): Current state
            action (int): Chosen action
            reward (float): Resulting reward
            next_state (Tensor): State after action
            done (bool): True if terminal state
        """
        self.memory.add(state, action, reward, next_state, done)

        # Learn as soon as we have enough stored experiences
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            for _ in range(NUM_UPDATES):
                experiences = self.memory.sample()
                self.learn(experiences)

    def learn(self, experiences):
        """Learn from batch of experiences."""
        states, actions, rewards, next_states, dones = experiences

        # region Update Critic
        actions_next = self.actor_target(next_states)
        q_targets_next = self.critic_target(next_states, actions_next)

        q_targets = rewards + (GAMMA * q_targets_next * (1 - dones))

        q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(q_expected, q_targets)

        # Minimize loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1.0)
        self.critic_optimizer.step()
        # endregion

        # region Update Actor
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        # endregion

        # Update target networks
        soft_update(self.critic_local, self.critic_target, TAU)
        soft_update(self.actor_local, self.actor_target, TAU)
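
learn() above ends by calling a module-level soft_update helper that is not shown; a sketch consistent with the TAU-weighted Polyak update used in the other examples (an assumption about this project's helper):

def soft_update(local_model, target_model, tau):
    """Polyak-average local parameters into the target network (sketch)."""
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)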
Example #9
class Agent(object):
    """DQN Agent that interacts and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 device,
                 replay_buffer_size=int(1e5),
                 batch_size=64,
                 discount_factor=0.99,
                 soft_update=1e-3,
                 learning_rate=5e-4,
                 update_every=4,
                 **kwargs):
        """Initializes the DQN agent.

        Args:
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            device (torch.device): Device to use for calculations
            replay_buffer_size (int): Size of replay buffer
            batch_size (int): Size of experience batches during training
            discount_factor (float): Discount factor (gamma)
            soft_update (float): Soft update coefficient (tau)
            learning_rate (float): Learning rate (alpha)
            update_every (int): Steps between updating the network
            **kwargs: Arguments describing the QNetwork
        """
        self.state_size = state_size
        """Dimension of each state"""

        self.action_size = action_size
        """Dimension of each action"""

        self.device = device
        """Device to use for calculations"""

        # Parameters
        self.batch_size = batch_size
        """Size of experience batches during training"""

        self.discount_factor = discount_factor
        """Discount factor (gamma)"""

        self.soft_update = soft_update
        """Soft update coefficient (tau)"""

        self.update_every = update_every
        """Steps between updating the network"""

        # Q Networks
        self.target_network = QNetwork(state_size, action_size, **kwargs) \
            .to(device)
        """Target Q-Network"""

        self.local_network = QNetwork(state_size, action_size, **kwargs) \
            .to(device)
        """Local Q-Network"""

        self.optimizer = optim.Adam(self.local_network.parameters(),
                                    lr=learning_rate)
        """Optimizer used when training the Q-network."""

        # Memory
        self.memory = ReplayBuffer(replay_buffer_size, batch_size, device)

        # Time step
        self.t_step = 0
        """Current time step"""

    def save_weights(self, path):
        """Save local network weights.

        Args:
            path (string): File to save to"""
        self.local_network.save_weights(path)

    def load_weights(self, path):
        """Load local network weights.

        Args:
            path (string): File to load weights from"""
        self.local_network.load_weights(path)

    def act(self, state, eps=0.):
        """Returns action for given state according to the current policy
            
        Args:
            state (np.ndarray): Current state
            eps (float): Probability of selecting random action (epsilon)
            
        Returns:
            int: Epsilon-greedily selected action
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # Temporarily set evaluation mode (no dropout &c) & turn off autograd
        self.local_network.eval()
        with torch.no_grad():
            action_values = self.local_network(state)
        self.local_network.train()

        # Select action epsilon-greedily
        if random.random() > eps:
            return np.argmax(action_values.cpu().detach().numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def step(self, state, action, reward, next_state, done):
        """Save experience and learn if due.

        Args:
            state (Tensor): Current state
            action (int): Chosen action
            reward (float): Resulting reward
            next_state (Tensor): State after action
            done (bool): True if terminal state
        """
        self.memory.add(state, action, reward, next_state, done)

        # Learn if at update_every steps
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # Check that we have enough stored experiences
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences)

    def learn(self, experiences):
        """Update Q-network using given experiences

        Args:
            experiences (Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]:
                SARS'+done tuple
        """
        states, actions, rewards, next_states, dones = experiences

        # Predicted Q values from target model for next states
        # (N.B. torch.max returns a tuple (max, argmax))
        q_target_next = self.target_network(next_states).max(dim=1,
                                                             keepdim=True)[0]

        # Computed target Q values for current state
        q_target = rewards + self.discount_factor * q_target_next * (1 - dones)

        # Predicted Q values from local model for current state
        q_local = self.local_network(states).gather(dim=1, index=actions)

        loss = F.mse_loss(q_local, q_target)

        # Update local network weights
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        soft_update(self.local_network, self.target_network, self.soft_update)
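
A hypothetical training loop for the DQN agent above, showing how act() and step() are typically wired to a gym-style environment (the environment, episode count, and epsilon decay are assumptions):

# Hypothetical driver loop; `env` and `agent` are assumed to exist already.
eps = 1.0
for i_episode in range(1, 1001):
    state = env.reset()
    score = 0.0
    done = False
    while not done:
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        score += reward
    eps = max(0.01, eps * 0.995)  # decay exploration between episodes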
Example #10
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0  #0
        self.exploration_theta = 0.15  #0.15
        self.exploration_sigma = 0.2  #0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # Score tracker and learning parameters
        self.best_score = -np.inf

    def reset_episode(self):
        self.total_reward = 0.0
        self.count = 0

        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Save reward
        self.total_reward += reward
        self.count += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

        if done:
            # Keeping track of the score
            self.score = self.total_reward / float(
                self.count) if self.count else 0.0
            if self.score > self.best_score:
                self.best_score = self.score

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
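
A hypothetical driver for the task-based agent above; `task` is assumed to expose reset() and step(action) -> (next_state, reward, done), matching the calls the agent makes:

# Hypothetical episode loop; names other than the agent's own API are assumed.
agent = DDPG(task)
for i_episode in range(1, 501):
    state = agent.reset_episode()
    while True:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
        if done:
            print("Episode {:4d} avg reward: {:7.3f} (best: {:7.3f})".format(
                i_episode, agent.score, agent.best_score))
            break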