class TestCritic(unittest.TestCase):
    def setUp(self):
        self.state_dim = (2, 80, 80)

        self.critic = Critic()

    def test_forward(self):
        n = 2
        batch = torch.tensor(np.random.random_sample((n, ) + self.state_dim),
                             dtype=torch.float)

        values = self.critic.forward(batch)
        self.assertEqual((n, 1), values.size())
Example #2
class DDPG:
    def __init__(self, state_dim, action_dim):
        self.critic = Critic(state_dim, action_dim).to(device)
        self.target_c = copy.deepcopy(self.critic)

        self.actor = Actor(state_dim).to(device)
        self.target_a = copy.deepcopy(self.actor)

        self.optimizer_c = optim.Adam(self.critic.parameters(), lr=LR)
        self.optimizer_a = optim.Adam(self.actor.parameters(), lr=LR)

    def act(self, state):
        state = torch.from_numpy(np.array(state)).float().to(device)
        return self.actor.forward(state).detach().squeeze(0).cpu().numpy()

    def update(self, batch):
        states, actions, rewards, next_states, dones = zip(*batch)
        states = torch.from_numpy(np.array(states)).float().to(device)
        actions = torch.from_numpy(np.array(actions)).float().to(device)
        rewards = torch.from_numpy(
            np.array(rewards)).float().to(device).unsqueeze(1)
        next_states = torch.from_numpy(
            np.array(next_states)).float().to(device)
        dones = torch.from_numpy(
            np.array(dones, dtype=np.float32)).to(device).unsqueeze(1)

        Q_current = self.critic(states, actions)
        Q_next = self.target_c(next_states,
                               self.target_a(next_states).detach())
        # Mask the bootstrapped value at terminal states.
        y = (rewards + GAMMA * (1 - dones) * Q_next).detach()

        ##################Update critic#######################
        loss_c = F.mse_loss(y, Q_current)
        self.optimizer_c.zero_grad()
        loss_c.backward()
        self.optimizer_c.step()

        ##################Update actor#######################
        loss_a = -self.critic.forward(states, self.actor(states)).mean()
        self.optimizer_a.zero_grad()
        loss_a.backward()
        self.optimizer_a.step()

        ##################Update targets#######################
        for target_pr, pr in zip(self.target_a.parameters(),
                                 self.actor.parameters()):
            target_pr.data.copy_(TAU * pr.data + (1 - TAU) * target_pr.data)

        for target_pr, pr in zip(self.target_c.parameters(),
                                 self.critic.parameters()):
            target_pr.data.copy_(TAU * pr.data + (1 - TAU) * target_pr.data)
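A minimal sketch of how this DDPG class might be driven end to end, assuming the classic Gym API (reset() returns an observation, step() returns four values) and the same module-level LR, GAMMA, TAU, and device constants the class itself relies on. The train_ddpg name, episode count, and list-based replay handling below are illustrative, not part of the original.

import random

def train_ddpg(env, episodes=200, batch_size=64, buffer_size=100_000):
    # Hypothetical driver loop for the DDPG class above.
    agent = DDPG(env.observation_space.shape[0], env.action_space.shape[0])
    buffer = []
    for _ in range(episodes):
        state, done = env.reset(), False
        while not done:
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            buffer.append((state, action, reward, next_state, done))
            buffer = buffer[-buffer_size:]  # crude FIFO replay memory
            if len(buffer) >= batch_size:
                agent.update(random.sample(buffer, batch_size))
            state = next_state
    return agent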
Example #3
class TestCritic(unittest.TestCase):
    def setUp(self):
        self.state_dim = 24 * 2
        self.action_dim = 2 * 2

        self.critic = Critic(state_dim=self.state_dim,
                             action_dim=self.action_dim,
                             fc1_units=64,
                             fc2_units=64,
                             seed=0)

    def test_forward(self):
        n = 2
        states = torch.tensor(np.random.random_sample((n, self.state_dim)),
                              dtype=torch.float)
        actions = torch.tensor(np.random.random_sample((n, self.action_dim)),
                               dtype=torch.float)

        values = self.critic.forward(states, actions)
        self.assertEqual((n, 1), values.size())
Example #4
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if len(envs.observation_space.shape) == 3:
        actor_critic = Actor(obs_shape[0], envs.action_space,
                             args.recurrent_policy, envs.action_space.n)
        target_actor = Actor(obs_shape[0], envs.action_space,
                             args.recurrent_policy, envs.action_space.n)
        critic = Critic(in_channels=4, num_actions=envs.action_space.n)
        critic_target = Critic(in_channels=4, num_actions=envs.action_space.n)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if args.cuda:
        actor_critic.cuda()
        critic.cuda()
        critic_target.cuda()
        target_actor.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
        critic_optim = optim.Adam(critic.parameters(), lr=1e-4)
        gamma = 0.99
        tau = 0.001

    #memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length)
    mem_buffer = ReplayBuffer()

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size,
                              envs.action_space.n)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            value = critic.forward(
                Variable(rollouts.observations[step], volatile=True),
                action_log_prob)
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe the reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            pre_state = rollouts.observations[step].cpu().numpy()
            update_current_obs(obs)
            mem_buffer.add((pre_state, current_obs,
                            action_log_prob.data.cpu().numpy(), reward, done))
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        action, action_log_prob, states = actor_critic.act(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))  #[0].data

        next_value = critic.forward(
            Variable(rollouts.observations[-1], volatile=True),
            action_log_prob).data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if True:
            state, next_state, action, reward, done = mem_buffer.sample(5)
            next_state = next_state.reshape([-1, *obs_shape])
            state = state.reshape([-1, *obs_shape])
            action = action.reshape([-1, 6])
            next_q_values = critic_target(
                to_tensor(next_state, volatile=True),
                target_actor(to_tensor(next_state, volatile=True),
                             to_tensor(next_state, volatile=True),
                             to_tensor(next_state, volatile=True))[0])
            next_q_values.volatile = False
            target_q_batch = to_tensor(reward) + args.gamma * to_tensor(
                done.astype(float)) * next_q_values
            critic.zero_grad()
            q_batch = critic(to_tensor(state), to_tensor(action))
            value_loss = criterion(q_batch, target_q_batch)
            value_loss.backward()
            critic_optim.step()
            actor_critic.zero_grad()
            policy_loss = -critic(
                to_tensor(state),
                actor_critic(to_tensor(state), to_tensor(state),
                             to_tensor(state))[0])
            policy_loss = policy_loss.mean()
            policy_loss.backward()
            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)
            optimizer.step()
            soft_update(target_actor, actor_critic, tau)
            soft_update(critic_target, critic, tau)
        '''
        if args.algo in ['a2c', 'acktr']:
            action_log_probs, probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                                                                                           Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                                                                                           Variable(rollouts.masks[:-1].view(-1, 1)),
                                                                                           Variable(rollouts.actions.view(-1, action_shape)))
            values = critic.forward(Variable(rollouts.observations[:-1].view(-1, *obs_shape)), probs).data

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

            #advantages = Variable(rollouts.returns[:-1]) - values
            advantages = rollouts.returns[:-1] - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages) * action_log_probs).mean()
            #action_loss = -(Variable(advantages.data) * action_log_probs).mean()


            optimizer.zero_grad()
            critic_optim.zero_grad()
            (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

            optimizer.step()
            critic_optim.step()
        '''
        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        value_loss.data.cpu().numpy()[0],
                        policy_loss.data.cpu().numpy()[0]))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
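This training loop also leans on helpers defined outside the excerpt: `criterion` (an MSE loss) and `to_tensor` (a NumPy-to-tensor wrapper whose `volatile` flag comes from the legacy pre-0.4 PyTorch API the script is written against). A rough, modernized reconstruction is sketched below; the `volatile` argument is kept only so the call sites above still match, and is simply ignored here.

import numpy as np
import torch
import torch.nn as nn

criterion = nn.MSELoss()

def to_tensor(ndarray, volatile=False, requires_grad=False):
    # Sketch only: the original presumably wrapped the array in a Variable and
    # passed `volatile` through; here the flag is accepted but has no effect.
    tensor = torch.from_numpy(np.asarray(ndarray)).float()
    if args.cuda:  # `args` is the script-level argument namespace used above
        tensor = tensor.cuda()
    tensor.requires_grad_(requires_grad)
    return tensor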
Example #5
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        ####self.num_agents = num_agents

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target.forward(next_states)
        Q_targets_next = self.critic_target.forward(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local.forward(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()

        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)

        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local.forward(states)

        actor_loss = -self.critic_local.forward(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
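This agent refers to several module-level constants (BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, LR_ACTOR, LR_CRITIC, WEIGHT_DECAY) and a device object that are not part of the excerpt. The values below are typical for agents written in this style, but they are an assumption, not taken from the original source.

import torch

# Assumed hyperparameters -- illustrative values only.
BUFFER_SIZE = int(1e6)   # replay buffer size
BATCH_SIZE = 128         # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation factor
LR_ACTOR = 1e-4          # actor learning rate
LR_CRITIC = 1e-3         # critic learning rate
WEIGHT_DECAY = 0.0       # L2 weight decay for the critic optimizer

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")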
Example #6
class Agent():
    def __init__(self, state_size, action_size, num_agents, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            seed (int): random seed
        """
        self.seed = random.seed(seed)
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents

        #Actor Network
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        #Critic Network
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), seed)

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, seed)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        action = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()  # set module to evaluation mode
        with torch.no_grad():
            for agent_idx, state_ in enumerate(state):
                action[agent_idx, :] = self.actor_local.forward(
                    state_).cpu().data.numpy()
        self.actor_local.train()  # reset it back to training mode

        if add_noise:
            action += self.noise.sample()

        return np.clip(action, -1, 1)  # restrict the output boundary -1, 1

    def reset(self):
        self.noise.reset()

    def step(self, state, action, reward, next_state, done, timeStep):
        """Save experience in replay memory, and use random sample from buffer to updateWeight_local."""
        for i in range(self.num_agents):
            self.memory.add(state[i, :], action[i, :], reward[i],
                            next_state[i, :], done[i])
        if len(self.memory) > BATCH_SIZE and timeStep % 2 == 0:
            self.updateWeight_local(self.memory.sample(), GAMMA)

    def updateWeight_local(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
           actor_target(state) -> action
           critic_target(state, action) -> Q-value

        Params
        ======
           experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
           gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models

        next_actions = self.actor_target(next_states)  # target policy's next actions (DDPG target)
        Q_target_next = self.critic_target.forward(next_states, next_actions)
        Q_target = rewards + gamma * Q_target_next * (1 - dones)
        Q_local = self.critic_local.forward(states, actions)
        critic_loss = F.mse_loss(Q_local, Q_target)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local.forward(states)
        actor_loss = -self.critic_local(
            states,
            actions_pred).mean()  # negated for reward maximization (gradient ascent)
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.updateWeight_target(self.critic_local, self.critic_target, TAU)
        self.updateWeight_target(self.actor_local, self.actor_target, TAU)

    def updateWeight_target(self, local_model, target_model, tau):
        """Soft update TARGET model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
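None of these excerpts define `ReplayBuffer`, and its constructor signature varies between examples. Below is a minimal deque-based sketch compatible with the `ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, seed)` call used above; the class name matches, everything inside is an assumption.

import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class ReplayBuffer:
    """Fixed-size buffer of experience tuples (a sketch, not the original class)."""

    def __init__(self, buffer_size, batch_size, seed):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        batch = random.sample(self.memory, k=self.batch_size)

        def stack(values):
            return torch.from_numpy(np.asarray(values)).float().to(device)

        states = stack([e.state for e in batch])
        actions = stack([e.action for e in batch])
        rewards = stack([e.reward for e in batch]).unsqueeze(1)
        next_states = stack([e.next_state for e in batch])
        dones = stack([float(e.done) for e in batch]).unsqueeze(1)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)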
Example #7
class DDPG:
    def __init__(self, args):
        """
            Initialize the agent.
            Args:
                - args: configuration object holding the hyperparameters used below
        """
        self.state_size = args.state_size
        self.action_size = args.action_size
        self.bs = args.bs
        self.gamma = args.gamma
        self.epsilon = args.epsilon
        self.tau = args.tau
        self.discrete = args.discrete
        self.randomer = OUNoise(args.action_size)
        self.buffer = ReplayBuffer(args.max_buff)

        self.actor = Actor(self.state_size, self.action_size)
        self.actor_target = Actor(self.state_size, self.action_size)
        self.actor_opt = AdamW(self.actor.parameters(), args.lr_actor)

        self.critic = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        self.critic_opt = AdamW(self.critic.parameters(), args.lr_critic)

        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

    def reset(self):
        """
            reset the exploration noise process
        """
        self.randomer.reset()

    def get_action(self, state):
        """
            get an action for the given state (deterministic policy output plus exploration noise)
            Args:
                - state: list, shape == [state_size]
        """
        state = torch.tensor(state, dtype=torch.float).unsqueeze(0)
        action = self.actor(state).detach()
        action = action.squeeze(0).numpy()
        action += self.epsilon * self.randomer.noise()
        action = np.clip(action, -1.0, 1.0)
        return action

    def learning(self):
        """
            learn models
        """
        s1, a1, r1, t1, s2 = self.buffer.sample_batch(self.bs)
        # convert terminal flags into continuation masks (1 = not done)
        t1 = 1 - t1
        s1 = torch.tensor(s1, dtype=torch.float)
        a1 = torch.tensor(a1, dtype=torch.float)
        r1 = torch.tensor(r1, dtype=torch.float)
        t1 = torch.tensor(t1, dtype=torch.float)
        s2 = torch.tensor(s2, dtype=torch.float)

        a2 = self.actor_target(s2).detach()
        q2 = self.critic_target(s2, a2).detach()
        q2_plus_r = r1[:, None] + t1[:, None] * self.gamma * q2
        q1 = self.critic.forward(s1, a1)

        # critic gradient
        critic_loss = nn.MSELoss()
        loss_critic = critic_loss(q1, q2_plus_r)
        self.critic_opt.zero_grad()
        loss_critic.backward()
        self.critic_opt.step()

        # actor gradient
        pred_a = self.actor.forward(s1)
        loss_actor = (-self.critic.forward(s1, pred_a)).mean()
        self.actor_opt.zero_grad()
        loss_actor.backward()
        self.actor_opt.step()

        # Only the actor and critic receive gradient updates (actor_opt.step(),
        # critic_opt.step()); the target networks track them via soft updates below.
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return loss_actor.item(), loss_critic.item()
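`hard_update` and `soft_update` are likewise assumed to exist elsewhere in the project. The following is a sketch consistent with how they are called here (target network first, source network second) and with the soft-update methods shown in the other examples.

def hard_update(target, source):
    """Copy every source parameter into the target network (sketch)."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)

def soft_update(target, source, tau):
    """Polyak averaging: theta_target <- tau*theta_source + (1 - tau)*theta_target (sketch)."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * source_param.data +
                                (1.0 - tau) * target_param.data)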
Example #8
class DDPG:
    """Interacts with and learns from the environment.
    There are two agents; each agent's observation has 24 dimensions and each agent's action has 2 dimensions.
    Two separate actor networks are used, one per agent, each taking only that agent's observations and outputting that agent's action.
    The critic for each agent sees the actions and observations of all agents. """
    def __init__(self, state_size, action_size, num_agents):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state for each agent
            action_size (int): dimension of each action for each agent
        """
        self.state_size = state_size
        self.action_size = action_size

        self.actor_local = Actor(state_size, action_size).to(DEVICE)
        self.actor_target = Actor(state_size, action_size).to(DEVICE)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        self.critic_local = Critic(num_agents * state_size,
                                   num_agents * action_size).to(DEVICE)
        self.critic_target = Critic(num_agents * state_size,
                                    num_agents * action_size).to(DEVICE)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC)

        self.noise_scale = NOISE_START

        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)

    def act(self, states, add_noise=True):
        """Returns actions for given state as per current policy."""

        if self.noise_scale > NOISE_END:
            self.noise_scale *= NOISE_REDUCTION

        if not add_noise:
            self.noise_scale = 0.0

        states = torch.from_numpy(states).float().to(DEVICE)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()

        actions += self.noise_scale * self.noise()

        return np.clip(actions, -1, 1)

    def noise(self):
        return 0.5 * np.random.randn(1, self.action_size)

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        full_states, actor_full_actions, full_actions, agent_rewards, \
            agent_dones, full_next_states, critic_full_next_actions = experiences

        # ---------------------------- update critic ---------------------------- #
        Q_target_next = self.critic_target(full_next_states,
                                           critic_full_next_actions)
        Q_target = agent_rewards + gamma * Q_target_next * (1 - agent_dones)
        Q_expected = self.critic_local(full_states, full_actions)
        critic_loss = F.mse_loss(input=Q_expected, target=Q_target)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        actor_loss = -self.critic_local.forward(full_states,
                                                actor_full_actions).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    @staticmethod
    def hard_update(target, source):
        for target_param, source_param in zip(target.parameters(),
                                              source.parameters()):
            target_param.data.copy_(source_param.data)
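The experiences tuple that learn unpacks already holds concatenated full states and actions for the centralized critic. For the two-agent setup described in the class docstring (24-dimensional observations and 2-dimensional actions per agent), such a batch could be assembled roughly as follows; the function name and tensor names are illustrative, not from the source.

import torch

def build_centralized_batch(obs_0, obs_1, act_0, act_1):
    # obs_*: (batch, 24) per-agent observations, act_*: (batch, 2) per-agent actions.
    full_states = torch.cat((obs_0, obs_1), dim=1)    # (batch, 48)
    full_actions = torch.cat((act_0, act_1), dim=1)   # (batch, 4)
    return full_states, full_actions

actor_full_actions would be assembled the same way, substituting each agent's current actor output for its stored action.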
Example #9
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    # logger = Logger(environment_name = args.env_name, entropy_coff= 'entropy_coeff_' + str(args.entropy_coef), folder = args.folder)
    # logger.save_args(args)

    # print ("---------------------------------------")
    # print ('Saving to', logger.save_folder)
    # print ("---------------------------------------")

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    ### for the number of processes to use
    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)
    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    ## ALE environments mostly have a Discrete action_space type
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    ### shape == 3 for ALE environments: states are 3D (image pixels)
    if len(envs.observation_space.shape) == 3:
        actor = Actor(obs_shape[0], envs.action_space, args.recurrent_policy,
                      envs.action_space.n)
        target_actor = Actor(obs_shape[0], envs.action_space,
                             args.recurrent_policy, envs.action_space.n)
        critic = Critic(in_channels=4, num_actions=envs.action_space.n)
        critic_target = Critic(in_channels=4, num_actions=envs.action_space.n)
        baseline_target = Baseline_Critic(in_channels=4,
                                          num_actions=envs.action_space.n)

    if args.cuda:
        actor.cuda()
        critic.cuda()
        critic_target.cuda()
        target_actor.cuda()
        baseline_target.cuda()

    actor_optim = optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic_optim = optim.Adam(critic.parameters(), lr=args.critic_lr)
    baseline_optim = optim.Adam(baseline_target.parameters(), lr=1e-4)
    tau_soft_update = 0.001

    mem_buffer = ReplayBuffer()
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor.state_size,
                              envs.action_space.n)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()

    for j in range(num_updates):

        temperature = 1.0

        ## num_steps = 5 as in A2C
        for step in range(args.num_steps):
            temperature = temperature / (step + 1)
            # Sample actions
            action, action_log_prob, states, dist_entropy = actor.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True), temperature,
                envs.action_space.n, args.num_processes)

            value = critic.forward(
                Variable(rollouts.observations[step], volatile=True),
                action_log_prob)

            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe the reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            pre_state = rollouts.observations[step].cpu().numpy()
            update_current_obs(obs)

            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, dist_entropy.data,
                            value.data, reward, masks)

        nth_step_return = rollouts.returns[0].cpu().numpy()
        current_state = rollouts.observations[0].cpu().numpy()
        nth_state = rollouts.observations[-1].cpu().numpy()
        current_action = rollouts.action_log_probs[0].cpu().numpy()
        current_action_dist_entropy = rollouts.dist_entropy[0].cpu().numpy()

        mem_buffer.add((current_state, nth_state, current_action,
                        nth_step_return, done, current_action_dist_entropy))
        action, action_log_prob, states, dist_entropy = actor.act(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True), temperature,
            envs.action_space.n, args.num_processes)  #[0].data

        next_value = critic.forward(
            Variable(rollouts.observations[-1], volatile=True),
            action_log_prob).data
        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        bs_size = args.batch_size
        if len(mem_buffer.storage) >= bs_size:
            ##samples from the replay buffer
            state, next_state, action, returns, done, entropy_log_prob = mem_buffer.sample(
                bs_size)

            next_state = next_state.reshape([-1, *obs_shape])
            state = state.reshape([-1, *obs_shape])
            action = action.reshape([-1, envs.action_space.n])

            #current Q estimate
            q_batch = critic(to_tensor(state), to_tensor(action))
            # target Q estimate
            next_state_action_probs = target_actor(
                to_tensor(next_state, volatile=True),
                to_tensor(next_state, volatile=True),
                to_tensor(next_state, volatile=True))

            next_q_values = critic_target(to_tensor(next_state, volatile=True),
                                          next_state_action_probs[1])
            next_q_values.volatile = False
            target_q_batch = to_tensor(returns) + args.gamma * to_tensor(
                done.astype(float)) * next_q_values

            critic.zero_grad()
            value_loss = criterion(q_batch, target_q_batch)

            if args.gradient_penalty == True:
                gradients = torch.autograd.grad(value_loss,
                                                critic.parameters(),
                                                allow_unused=True,
                                                retain_graph=True,
                                                create_graph=True,
                                                only_inputs=True)[0]
                gradient_penalty = ((gradients.norm(2, dim=1) - 1)**
                                    2).mean() * args.lambda_grad_penalty
                gradient_penalty.backward()

            else:
                value_loss = criterion(q_batch, target_q_batch)
                value_loss.backward()

            critic_optim.step()

            actor.zero_grad()
            policy_loss = -critic(
                to_tensor(state),
                actor(to_tensor(state), to_tensor(state), to_tensor(state))[0])

            ### Soft trust region constraint for the actor
            current_action_probs = actor(to_tensor(state, volatile=False),
                                         to_tensor(state, volatile=False),
                                         to_tensor(state, volatile=False))[0]
            target_action_probs = target_actor(to_tensor(state, volatile=True),
                                               to_tensor(state, volatile=True),
                                               to_tensor(state,
                                                         volatile=True))[0]

            policy_regularizer = criterion(current_action_probs,
                                           target_action_probs)

            ## Actor update with entropy penalty
            policy_loss = policy_loss.mean() - args.entropy_coef * Variable(torch.from_numpy(np.expand_dims(entropy_log_prob.mean(), axis=0))).cuda() \
                            + args.actor_kl_lambda * policy_regularizer

            if args.actor_several_updates == True:
                for p in range(args.actor_updates):
                    policy_loss.backward(retain_graph=True)
            else:
                policy_loss.backward()

            ##clipping of gradient norms
            gradient_norms = nn.utils.clip_grad_norm(actor.parameters(),
                                                     args.max_grad_norm)
            print("gradient_norms", gradient_norms)
            actor_optim.step()

            if args.second_order_grads == True:
                """
                Training the Baseline critic (f(s, \mu(s)))
                """
                baseline_target.zero_grad()
                ## f(s, \mu(s))
                current_baseline = baseline_target(
                    to_tensor(state),
                    actor(to_tensor(state), to_tensor(state),
                          to_tensor(state))[0])

                ## \grad f(s,a)
                grad_baseline_params = torch.autograd.grad(
                    current_baseline.mean(),
                    actor.parameters(),
                    retain_graph=True,
                    create_graph=True)

                ## MSE : (Q - f)^{2}
                baseline_loss = (q_batch.detach() -
                                 current_baseline).pow(2).mean()
                # baseline_loss.volatile=True

                actor.zero_grad()
                baseline_target.zero_grad()
                grad_norm = 0
                for grad_1, grad_2 in zip(grad_params, grad_baseline_params):
                    grad_norm += grad_1.data.pow(2).sum() - grad_2.pow(2).sum()
                grad_norm = grad_norm.sqrt()

                ##Loss for the Baseline approximator (f)
                overall_loss = baseline_loss + args.lambda_second_order_grads * grad_norm
                overall_loss.backward()
                baseline_optim.step()

            soft_update(target_actor, actor, tau_soft_update)
            soft_update(critic_target, critic, tau_soft_update)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "" and len(
                mem_buffer.storage) >= bs_size:
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor
            if args.cuda:
                save_model = copy.deepcopy(actor).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(mem_buffer.storage) >= bs_size:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, value loss {:.5f}, policy loss {:.5f}, Entropy {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        value_loss.data.cpu().numpy()[0],
                        policy_loss.data.cpu().numpy()[0],
                        entropy_log_prob.mean()))

            final_rewards_mean = [final_rewards.mean()]
            final_rewards_median = [final_rewards.median()]
            final_rewards_min = [final_rewards.min()]
            final_rewards_max = [final_rewards.max()]

            all_value_loss = [value_loss.data.cpu().numpy()[0]]
            all_policy_loss = [policy_loss.data.cpu().numpy()[0]]

            # logger.record_data(final_rewards_mean, final_rewards_median, final_rewards_min, final_rewards_max, all_value_loss, all_policy_loss)
            # # logger.save()

        if args.vis and j % args.vis_interval == 0:

            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
Example #10
class Agent():
    def __init__(self,
                 action_space_shape,
                 observation_space_shape,
                 n_train_steps=50 * 1000000,
                 replay_memory_size=1000000,
                 k=3):

        # Cuda
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        # Hyperparameters - dynamic
        self.action_space_shape = action_space_shape
        self.observation_space_shape = observation_space_shape
        self.k = k
        self.observation_input_shape = multiply_tuple(
            self.observation_space_shape, self.k)
        self.n_train_steps = n_train_steps
        self.replay_memory_size = replay_memory_size
        self.replay_memory = deque(maxlen=self.replay_memory_size)

        # Hyperparameters - static
        self.training_start_time_step = 1000  # Minimum: k * minibatch_size == 3 * 64 = 192
        self.gamma = 0.99  # For reward discount
        self.tau = 0.001  # For soft update

        # Hyperparameters - Ornstein_Uhlenbeck_noise
        self.theta = 0.15
        self.sigma = 0.2
        self.Ornstein_Uhlenbeck_noise = OUNoise(
            action_space_shape=self.action_space_shape,
            theta=self.theta,
            sigma=self.sigma)

        # Hyperparameters - NN model
        self.minibatch_size = 64  # For training NN
        self.lr_actor = 10e-4
        self.lr_critic = 10e-3
        self.weight_decay_critic = 10e-2

        # Parameters - etc
        self.action = None
        self.time_step = 0
        self.train_step = 0
        self.train_complete = False

        # Modules
        self.actor = Actor(
            action_space_shape=self.action_space_shape,
            observation_space_shape=self.observation_input_shape).to(
                self.device)
        self.critic = Critic(
            action_space_shape=self.action_space_shape,
            observation_space_shape=self.observation_input_shape).to(
                self.device)
        self.actor_hat = copy.deepcopy(self.actor)
        self.critic_hat = copy.deepcopy(self.critic)

        self.optimizer_actor = optim.Adam(self.actor.parameters(),
                                          lr=self.lr_actor)
        self.optimizer_critic = optim.Adam(
            self.critic.parameters(),
            lr=self.lr_critic,
            weight_decay=self.weight_decay_critic)

        # Operations
        self.mode('train')

    def reset(self, observation):

        self.previous_observation = torch.tensor([observation] * self.k).to(
            dtype=torch.float,
            device=self.device).view(self.observation_input_shape)
        self.observation_buffer = list()
        self.reward = torch.tensor([0])  # Tensor form for compatibility
        self.Ornstein_Uhlenbeck_noise.reset()

        # Since replay memory is somewhat full, we can decrease waiting time for sufficient data to fill in the replay memory.
        self.training_start_time_step = max(
            0, self.training_start_time_step - self.time_step)
        self.time_step = 0
        # Don't reset replay_memory
        # self.replay_memory = deque(maxlen = self.replay_memory_size)

    def mode(self, mode):

        # Store under a separate attribute name: assigning to self.mode would
        # shadow this method after the first call (e.g. self.mode('train')).
        self._mode = mode
        if self._mode == 'train':
            pass
        elif self._mode == 'test':
            pass
        else:
            assert False, 'mode not specified'

    def wakeup(self):

        # Frame skipping:
        # see and select a new action every kth frame; otherwise skip the
        # frame and keep repeating the last action.
        if self.time_step % self.k == 0:
            return True
        else:
            return False

    def act(self):

        if self.wakeup() == True:
            self.action = self.actor.forward(
                self.previous_observation) + torch.as_tensor(
                    self.Ornstein_Uhlenbeck_noise(),
                    dtype=torch.float,
                    device=self.device)

        self.time_step += 1

        # Return numpy version
        return self.action.detach().cpu().numpy()

    def observe(self, observation, reward):

        if self.wakeup() == True:

            # Append observation
            self.observation_buffer.append(observation)
            self.new_observation = torch.tensor(self.observation_buffer).to(
                dtype=torch.float,
                device=self.device).view(self.observation_input_shape)

            # Add reward
            self.reward += reward

            # Store transition in replay memory
            # If memory size exceeds, the oldest memory is popped (deque property)
            # wrap self.action with torch.tensor() to reset requires_grad = False
            self.replay_memory.append(
                (self.previous_observation, self.action.clone().detach(),
                 self.reward, self.new_observation
                 ))  # self.action.new_tensor() == self.action.clone().detach()

            # The new observation will be the previous observation next time
            self.previous_observation = self.new_observation

            # Empty observation buffer, reset reward
            self.observation_buffer = list()
            self.reward = torch.tensor([0])  # Tensor form for compatibility

        else:

            self.observation_buffer.append(observation)
            self.reward += reward

    def random_sample_data(self):

        memory_size = len(self.replay_memory)

        # state, action, reward, state_next
        s_i = list()
        a_i = list()
        r_i = list()
        s_i_1 = list()

        # Random Sample transitions, append them into np arrays
        random_index = np.random.choice(
            memory_size, size=self.minibatch_size, replace=False
        )  # random_index: [0,5,4,9, ...] // "replace = False" makes the indices exclusive.

        for index in random_index:

            # Random sample transitions, 'minibatch' times
            s, a, r, s_1 = self.replay_memory[index]
            s_i.append(s)  # equivalent to [self.replay_memory[index][0] for index in random_index]
            a_i.append(a)
            r_i.append(r)
            s_i_1.append(s_1)

        s_i = torch.stack(s_i).to(dtype=torch.float, device=self.device)
        a_i = torch.stack(a_i).to(dtype=torch.float, device=self.device)
        r_i = torch.stack(r_i).to(dtype=torch.float, device=self.device)
        s_i_1 = torch.stack(s_i_1).to(dtype=torch.float, device=self.device)

        return s_i, a_i, r_i, s_i_1

    def train(self):

        if self.wakeup() and self.time_step >= self.training_start_time_step:
            # 1. Sample random minibatch of transitions from replay memory
            # state, action, reward, state_next
            s_i, a_i, r_i, s_i_1 = self.random_sample_data()  # minibatch size comes from self.minibatch_size

            # 2. Set y_i
            y_i = r_i + self.gamma * self.critic_hat.forward(
                s_i_1, self.actor_hat.forward(s_i_1)).detach()

            # 3. Calculate Loss
            self.optimizer_critic.zero_grad()
            critic_loss = F.mse_loss(y_i, self.critic.forward(s_i, a_i))

            # 4. Update Critic
            critic_loss.backward()
            self.optimizer_critic.step()

            # 5. Update Actor
            self.optimizer_actor.zero_grad()
            critic_Q_mean = -self.critic.forward(
                s_i, self.actor.forward(s_i)).mean()
            critic_Q_mean.backward()
            self.optimizer_actor.step()

            # 6. Update target networks
            # Soft update, parameter by parameter (modules cannot be scaled directly).
            for hat_param, param in zip(self.critic_hat.parameters(),
                                        self.critic.parameters()):
                hat_param.data.copy_(self.tau * param.data +
                                     (1 - self.tau) * hat_param.data)
            for hat_param, param in zip(self.actor_hat.parameters(),
                                        self.actor.parameters()):
                hat_param.data.copy_(self.tau * param.data +
                                     (1 - self.tau) * hat_param.data)

            # 7. Increment train step.
            # If train step meets its scheduled training steps, change "train_complete" status
            self.train_step += 1
            if self.train_step >= self.n_train_steps:
                self.train_complete = True
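The OUNoise class is referenced throughout these examples with slightly different constructors. For the keyword form used in this example (action_space_shape, theta, sigma) a minimal Ornstein-Uhlenbeck process could look like the sketch below, with mu fixed at zero as is common; everything here is an assumption about a class that is not shown.

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck exploration noise (sketch matching the calls above)."""

    def __init__(self, action_space_shape, theta=0.15, sigma=0.2, mu=0.0):
        self.mu = mu * np.ones(action_space_shape)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Re-center the process on its long-running mean.
        self.state = self.mu.copy()

    def __call__(self):
        # dx = theta * (mu - x) + sigma * N(0, 1), applied element-wise.
        dx = self.theta * (self.mu - self.state) + \
             self.sigma * np.random.randn(*self.mu.shape)
        self.state = self.state + dx
        return self.state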
Example #11
class Agent():
    def __init__(self, state_size, action_size, num_agents, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            seed (int): random seed
        """
        self.seed = random.seed(seed)
        self.state_size = state_size  # 24
        self.action_size = action_size  # 2
        self.num_agents = num_agents  # 2
        self.eps = eps_start

        #Actor Network: State -> Action
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        #Critic Network: State1 x State2 x Action1 x Action2 ... -> Qvalue
        self.critic_local = Critic(state_size * num_agents,
                                   action_size * num_agents, seed).to(device)
        self.critic_target = Critic(state_size * num_agents,
                                    action_size * num_agents, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, seed)

    def act(self, state, add_noise):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()  # set module to evaluation mode
        with torch.no_grad():
            action = self.actor_local.forward(state).cpu().data.numpy()
        self.actor_local.train()  # reset it back to training mode

        if add_noise:
            action += self.noise.sample() * self.eps

        return np.clip(action, -1, 1)  # restrict the output boundary -1, 1

    def reset(self):
        self.noise.reset()

    def step(self, state, action, reward, next_state, done, timestep,
             agent_index):
        """Save experience in replay memory, and use random sample from buffer to updateWeight_local."""
        # for i in range(self.num_agents):
        self.memory.add(state, action, reward, next_state, done)
        if len(self.memory) > BATCH_SIZE and timestep % UPDATE_FREQUENCY == 0:
            self.updateWeight_local(agent_index, self.memory.sample(), GAMMA)

    def updateWeight_local(self, agent_index, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
           actor_target(state) -> action
           critic_target(state, action) -> Q-value

        Params
        ======
           experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
           gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        # states: (batchsize, 24x2)
        # actions: (batchsize, 2x2)
        # rewards: (batchsize, 1x2)
        # next_states: (batchsize, 24x2)
        # dones: (batchsize, 1x2)
        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models

        self_next_actions = self.actor_target(
            next_states[:, self.state_size * agent_index:self.state_size *
                        (agent_index + 1)])  # target action from this agent's own observation
        notSelf_actions = actions[:, self.action_size *
                                  (1 - agent_index):self.action_size *
                                  (2 - agent_index)]  # the other agent's actions
        if agent_index == 0:  # concatenation order follows agent index
            next_actions = torch.cat((self_next_actions, notSelf_actions),
                                     dim=1).to(device)  # index 0 -> self first
        else:
            next_actions = torch.cat((notSelf_actions, self_next_actions),
                                     dim=1).to(device)  # index 1 -> self second

        Q_target_next = self.critic_target.forward(
            next_states,
            next_actions)  # critic sees both agents' observations and actions
        Q_target = (rewards + gamma * Q_target_next * (1 - dones)).detach()
        Q_local = self.critic_local.forward(states, actions)
        critic_loss = F.mse_loss(Q_local, Q_target)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        self_actions_pred = self.actor_local.forward(
            states[:, self.state_size * agent_index:self.state_size *
                   (agent_index + 1)])  # predicted action from this agent's own observation
        notSelf_actions = actions[:, self.action_size *
                                  (1 - agent_index):self.action_size *
                                  (2 - agent_index)]  # the other agent's actions
        if agent_index == 0:
            actions_pred = torch.cat((self_actions_pred, notSelf_actions),
                                     dim=1).to(device)
        else:
            actions_pred = torch.cat((notSelf_actions, self_actions_pred),
                                     dim=1).to(device)

        actor_loss = -self.critic_local(
            states,
            actions_pred).mean()  # negative sign: maximize Q via gradient ascent
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.updateWeight_target(self.critic_local, self.critic_target, TAU)
        self.updateWeight_target(self.actor_local, self.actor_target, TAU)

        # Update epsilon noise value
        self.eps = self.eps - (1 / eps_decay)
        if self.eps < eps_end:
            self.eps = eps_end

    def updateWeight_target(self, local_model, target_model, tau):
        """Soft update TARGET model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
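
# A standalone sketch (not from the original example) of the per-agent slicing
# convention assumed by updateWeight_local above: per-agent observations and
# actions are concatenated along the feature axis in agent-index order.
# Sizes below are hypothetical (2 agents, 24-dim observations, 2-dim actions).
import torch

batch_size, state_size, action_size = 4, 24, 2
states = torch.rand(batch_size, state_size * 2)    # [s_agent0 | s_agent1]
actions = torch.rand(batch_size, action_size * 2)  # [a_agent0 | a_agent1]

agent_index = 1
own_obs = states[:, state_size * agent_index:state_size * (agent_index + 1)]
other_actions = actions[:, action_size * (1 - agent_index):action_size * (2 - agent_index)]
assert own_obs.shape == (batch_size, state_size)
assert other_actions.shape == (batch_size, action_size)
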
class DDPGAgent:

    def __init__(self, env, gamma, tau, buffer_maxlen, critic_learning_rate, actor_learning_rate):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau

        # initialize actor and critic networks
        self.critic = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic_target = Critic(self.obs_dim, self.action_dim).to(self.device)

        self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)
        self.actor_target = Actor(self.obs_dim, self.action_dim).to(self.device)

        # Copy local network parameters into the target networks
        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data)

        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(param.data)

        # optimizers
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_learning_rate)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_learning_rate)

        self.replay_buffer = BasicBuffer(buffer_maxlen)
        self.noise = OUNoise(self.env.action_space)

    def get_action(self, obs):
        state = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
        action = self.actor.forward(state)
        action = action.squeeze(0).cpu().detach().numpy()

        return action

    def update(self, batch_size):
        state_batch, action_batch, reward_batch, next_state_batch, masks = self.replay_buffer.sample(batch_size)
        state_batch = torch.FloatTensor(state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        masks = torch.FloatTensor(masks).to(self.device)

        curr_Q = self.critic.forward(state_batch, action_batch)
        next_actions = self.actor_target.forward(next_state_batch)
        next_Q = self.critic_target.forward(next_state_batch, next_actions.detach())
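        # NOTE: `masks` is sampled above but never used; if it holds the done
        # flags, the usual target below would mask the bootstrap term, i.e.
        # expected_Q = reward_batch + self.gamma * next_Q * (1 - masks)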
        expected_Q = reward_batch + self.gamma * next_Q

        # update critic
        q_loss = F.mse_loss(curr_Q, expected_Q.detach())

        self.critic_optimizer.zero_grad()
        q_loss.backward()
        self.critic_optimizer.step()

        # update actor
        policy_loss = -self.critic.forward(state_batch, self.actor.forward(state_batch)).mean()

        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # update target networks
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))

        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))
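
# A hypothetical training loop for the DDPGAgent above (not from the original
# example). The Gym-style env API and a replay_buffer.push(state, action,
# reward, next_state, done) method are assumptions; the class itself only
# shows sample().
def train(env, agent, max_episodes=100, max_steps=500, batch_size=64):
    total_steps = 0
    for _ in range(max_episodes):
        state = env.reset()
        for _ in range(max_steps):
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.replay_buffer.push(state, action, reward, next_state, done)
            total_steps += 1
            if total_steps > batch_size:
                agent.update(batch_size)
            state = next_state
            if done:
                break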
Exemplo n.º 13
class Agent(object):
    """
    Interacts with and learns from the environment.
    """

    def __init__(self, state_space, hidden_size, action_size, num_agents,
                 seed=0, buffer_size=int(1e6),
                 actor_lr=1e-4, actor_hidden_sizes=(128, 256), actor_weight_decay=0,
                 critic_lr=1e-4, critic_hidden_sizes=(128, 256, 128), critic_weight_decay=0,
                 batch_size=128, gamma=0.99, tau=1e-3):
        """
        Initialize an Agent object.

        Params
        ======
            state_space (tuple): dimension of each state
            hidden_size (int): hidden size passed to the actor and critic models
            action_size (int): dimension of each action
            num_agents (int): number of agents to train
            seed (int): random seed, default value is 0
            buffer_size (int): buffer size of experience memory, default value is 1000000

            actor_lr (float): learning rate of actor model, default value is 1e-4
            actor_hidden_sizes (tuple): sizes of hidden layers of actor model, default value is (128, 256)
            critic_lr (float): learning rate of critic model, default value is 1e-4
            critic_hidden_sizes (tuple): sizes of hidden layers of critic model, default value is (128, 256, 128)

            batch_size (int): mini-batch size
            gamma (float): discount factor
            tau (float): interpolation parameter
        """
        self.state_space = state_space
        self.hidden_size = hidden_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = seed

        self.batch_size = batch_size  # mini-batch size
        self.gamma = gamma  # discount factor
        self.tau = tau  # for soft update of target parameters

        # Actor Network
        self.actor_local = Actor(state_space, hidden_size, action_size, seed,
                                 hidden_units=actor_hidden_sizes).to(DEVICE)
        self.actor_target = Actor(state_space, hidden_size, action_size, seed,
                                  hidden_units=actor_hidden_sizes).to(DEVICE)
        self.actor_target.eval()
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=actor_lr,
                                          weight_decay=actor_weight_decay)

        # Critic Network
        self.critic_local = Critic(state_space, hidden_size, action_size, seed,
                                   hidden_units=critic_hidden_sizes).to(DEVICE)
        self.critic_target = Critic(state_space, hidden_size, action_size, seed,
                                    hidden_units=critic_hidden_sizes).to(DEVICE)
        self.critic_target.eval()
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=critic_lr,
                                           weight_decay=critic_weight_decay)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), seed)

        # Replay memory
        self.memory = ReplyBuffer(buffer_size=buffer_size, seed=seed)

        # copy parameters of the local model to the target model
        self.soft_update(self.critic_local, self.critic_target, 1.)
        self.soft_update(self.actor_local, self.actor_target, 1.)

        self.seed = random.seed(seed)
        np.random.seed(seed)

        self.reset()

    def reset(self):
        self.noise.reset()

    def act(self, state, add_noise=True):
        state = np.asarray([state])

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1., 1.)

    def step(self, state, action, reward, next_state, done):
        """
        Save experience in replay memory, and use random sample from buffer to learn.
        """

        # Save experience / reward
        #  for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample(batch_size=self.batch_size)
            self.learn(experiences, self.gamma)

    def learn(self, experiences, gamma):
        """
        Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        actions, rewards, dones = torch.from_numpy(actions).float().to(DEVICE), \
                                  torch.from_numpy(rewards).float().to(DEVICE), \
                                  torch.from_numpy(dones).float().to(DEVICE)

        # ------- update critic ------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        q_targets = rewards + (gamma * q_targets_next * (1 - dones))
        q_targets = q_targets.detach()

        # Compute critic loss
        q_expected = self.critic_local(states, actions)
        assert q_expected.shape == q_targets.shape
        critic_loss = F.mse_loss(q_expected, q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1.0)  # clip the gradient (Udacity)
        self.critic_optimizer.step()

        # ------- update actor ------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local.forward(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        #  update target networks
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        return actor_loss.item(), critic_loss.item()

    def soft_update(self, local_model, target_model, tau):
        """
        Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.detach_()
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def save(self):
        """
        Save model state
        """
        torch.save(self.actor_local.state_dict(), "checkpoints/checkpoint_actor.pth")
        torch.save(self.actor_target.state_dict(), "checkpoints/checkpoint_actor_target.pth")

        torch.save(self.critic_local.state_dict(), "checkpoints/checkpoint_critic.pth")
        torch.save(self.critic_target.state_dict(), "checkpoints/checkpoint_critic_target.pth")

    def load(self):
        """
        Load model state
        """
        if not os.path.exists("checkpoints/checkpoint_actor.pth") or \
                not os.path.exists("checkpoints/checkpoint_actor_target.pth") or \
                not os.path.exists("checkpoints/checkpoint_critic.pth") or \
                not os.path.exists("checkpoints/checkpoint_critic_target.pth"):
            return

        self.actor_local.load_state_dict(torch.load("checkpoints/checkpoint_actor.pth"), strict=False)
        self.actor_target.load_state_dict(torch.load("checkpoints/checkpoint_actor_target.pth"), strict=False)

        self.critic_local.load_state_dict(torch.load("checkpoints/checkpoint_critic.pth"), strict=False)
        self.critic_target.load_state_dict(torch.load("checkpoints/checkpoint_critic_target.pth"), strict=False)

    def __str__(self):
        return f"{str(self.actor_local)}\n{str(self.critic_local)}"
Exemplo n.º 14
class MADDPG:
    def __init__(self,
                 num_agents,
                 local_obs_dim,
                 local_action_size,
                 global_obs_dim,
                 global_action_size,
                 discount_factor=0.95,
                 tau=0.02,
                 device=device,
                 random_seed=4,
                 lr_critic=1.0e-4,
                 weight_decay=0.0):
        super(MADDPG, self).__init__()

        # parameter configuration
        self.num_agents = num_agents
        self.device = device
        self.discount_factor = discount_factor
        self.tau = tau
        self.global_action_size = global_action_size
        self.global_obs_dim = global_obs_dim
        torch.manual_seed(random_seed)
        random.seed(random_seed)
        self.random_seed = random_seed
        self.weight_decay = weight_decay

        # define actors
        self.actors = [
            DDPGActor(num_agents,
                      local_obs_dim,
                      local_action_size,
                      global_obs_dim,
                      global_action_size,
                      device=device) for _ in range(num_agents)
        ]
        # define centralized critic
        self.critic = Critic(global_obs_dim, global_action_size,
                             self.random_seed).to(self.device)
        self.target_critic = Critic(global_obs_dim, global_action_size,
                                    self.random_seed).to(self.device)
        hard_update(self.target_critic, self.critic)

        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=self.weight_decay)

        # noise coef
        self.noise_coef = 1.0
        self.noise_coef_decay = 1e-6

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)

    def act(self, obs_all_agents):
        actions = [
            ddpg_actor.act(local_obs, self.noise_coef)
            for ddpg_actor, local_obs in zip(self.actors, obs_all_agents)
        ]
        return actions

    def target_act(self, obs_all_agents):
        actions = [
            ddpg_actor.target_act(local_obs, noise_coef=0, add_noise=False)
            for ddpg_actor, local_obs in zip(self.actors, obs_all_agents)
        ]
        return actions

    def step(self, obs, obs_full, actions, rewards, next_obs, next_obs_full,
             dones, timestep):
        self.memory.add(obs, obs_full, actions, rewards, next_obs,
                        next_obs_full, dones)

        timestep = timestep % TRAIN_EVERY

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and timestep == 0:
            for _ in range(N_LEARN_UPDATES):
                experiences = self.memory.sample()
                self.learn(experiences, self.discount_factor)

    def learn(self, experiences, gamma):
        obs, obs_full, action, reward, next_obs, next_obs_full, done = experiences

        obs = obs.permute(1, 0, -1)  # agent_id * batch_size * state_size
        obs_full = obs_full.view(-1, self.global_obs_dim)
        next_obs = next_obs.permute(1, 0, -1)
        next_obs_full = next_obs_full.view(-1, self.global_obs_dim)
        action = action.reshape(-1, self.global_action_size)

        # ---------------- update centralized critic ----------------------- #
        self.critic_optimizer.zero_grad()

        # get target actions from all target_actors
        target_actions = np.array(self.target_act(next_obs))
        target_actions = torch.from_numpy(target_actions).float().permute(
            1, 0, -1)
        target_actions = target_actions.reshape(-1, self.global_action_size)

        # update critic
        with torch.no_grad():
            q_next = self.target_critic.forward(next_obs_full,
                                                target_actions.to(self.device))

        y = reward + gamma * q_next * (1 - done)

        q = self.critic.forward(obs_full, action)

        critic_loss = 0
        for i in range(self.num_agents):
            critic_loss += F.mse_loss(q, y[:, i].detach().reshape(
                -1, 1)) / self.num_agents
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------- update actor for all agents --------------------- #
        for ii in range(len(self.actors)):
            self.actors[ii].actor_optimizer.zero_grad()

            q_action = [ self.actors[i].actor_local(ob) if i == ii \
                   else self.actors[i].actor_local(ob).detach()
                   for i, ob in enumerate(obs) ]

            q_action = torch.stack(q_action).permute(1, 0, -1)
            q_action = q_action.reshape(-1, self.global_action_size).to(
                self.device)

            # policy_gradient
            actor_loss = -self.critic.forward(obs_full, q_action).mean()
            actor_loss.backward()
            self.actors[ii].actor_optimizer.step()

        # --------------- soft update all target networks ------------------- #
        soft_update(self.target_critic, self.critic, self.tau)
        for actor in self.actors:
            actor.update_target(self.tau)

        # -------------- reset noise --------------------------------------- #
        for actor in self.actors:
            actor.action_noise.reset()

        self.noise_coef -= self.noise_coef_decay
        if self.noise_coef < 0.01:
            self.noise_coef = 0.01
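
# Standalone sketch (not from the original example) of the shape bookkeeping
# used by MADDPG.learn above, with hypothetical sizes: 2 agents, 24-dim
# observations, 2-dim actions.
import torch

batch_size, num_agents, obs_dim, action_size = 5, 2, 24, 2
obs = torch.rand(batch_size, num_agents, obs_dim)

# agent-major view, as used when feeding each actor its own observations
obs_by_agent = obs.permute(1, 0, -1)                      # (num_agents, batch, obs_dim)
# flat "full" observation for the centralized critic
obs_full = obs.reshape(batch_size, num_agents * obs_dim)

per_agent_actions = [torch.rand(batch_size, action_size) for _ in range(num_agents)]
full_actions = torch.stack(per_agent_actions).permute(1, 0, -1).reshape(
    batch_size, num_agents * action_size)

assert obs_by_agent.shape == (num_agents, batch_size, obs_dim)
assert obs_full.shape == (batch_size, num_agents * obs_dim)
assert full_actions.shape == (batch_size, num_agents * action_size)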
Exemplo n.º 15
class DDPG(object):
    """Interacts with and learns from the environment.
    There are two agents and the observations of each agent has 24 dimensions. Each agent's action has 2 dimensions.
    Will use two separate actor networks (one for each agent using each agent's observations only and output that agent's action).
    The critic for each agents gets to see the actions and observations of all agents. """
    def __init__(self, state_size, action_size, num_agents):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state for each agent
            action_size (int): dimension of each action for each agent
            num_agents (int): number of agents in the environment
        """
        self.state_size = state_size
        self.action_size = action_size

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size).to(DEVICE)
        self.actor_target = Actor(state_size, action_size).to(DEVICE)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR,
                                          weight_decay=WEIGHT_DECAY_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(num_agents * state_size,
                                   num_agents * action_size).to(DEVICE)
        self.critic_target = Critic(num_agents * state_size,
                                    num_agents * action_size).to(DEVICE)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY_critic)

        # Noise process
        self.noise = OUNoise(action_size)  #single agent only
        self.noise_scale = NOISE_START

        # Make sure target is initialized with the same weight as the source (makes a big difference)
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)

    def act(self, states, i_episode, add_noise=True):
        """Returns actions for given state as per current policy."""

        if i_episode > EPISODES_BEFORE_TRAINING and self.noise_scale > NOISE_END:
            #self.noise_scale *= NOISE_REDUCTION
            self.noise_scale = NOISE_REDUCTION**(i_episode -
                                                 EPISODES_BEFORE_TRAINING)
        #else keep the previous value

        if not add_noise:
            self.noise_scale = 0.0

        states = torch.from_numpy(states).float().to(DEVICE)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()

        # add noise (Gaussian noise worked much better here than the OU noise process)
        actions += self.noise_scale * self.add_noise2()
        # actions += self.noise_scale * self.noise.sample()

        return np.clip(actions, -1, 1)

    def add_noise2(self):
        # Gaussian noise with sigma 0.5; a sigma of 1 would leave many actions clipped
        noise = 0.5 * np.random.randn(1, self.action_size)
        return noise

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        #for MADDPG
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        full_states, actor_full_actions, full_actions, agent_rewards, agent_dones, full_next_states, critic_full_next_actions = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get Q values from target models
        Q_target_next = self.critic_target(full_next_states,
                                           critic_full_next_actions)
        # Compute Q targets for current states (y_i)
        Q_target = agent_rewards + gamma * Q_target_next * (1 - agent_dones)
        # Compute critic loss
        Q_expected = self.critic_local(full_states, full_actions)
        critic_loss = F.mse_loss(
            input=Q_expected, target=Q_target.detach()
        )  # detaching is optional here (the target networks are never optimized) but avoids an unused graph
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        #torch.nn.utils.clip_grad_norm(self.critic_local.parameters(), 1.0) #clip the gradient for the critic network (Udacity hint)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actor_loss = -self.critic_local.forward(
            full_states, actor_full_actions).mean()  # negative sign: gradient ascent on Q
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

    def soft_update_all(self):
        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_update(self, target, source):
        for target_param, source_param in zip(target.parameters(),
                                              source.parameters()):
            target_param.data.copy_(source_param.data)
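
# Sketch (not from the original example) of the exploration-noise schedule used
# in act() above. NOISE_START, NOISE_END, NOISE_REDUCTION and
# EPISODES_BEFORE_TRAINING are undefined in the snippet, so the values below
# are assumptions for illustration only.
NOISE_START, NOISE_END = 1.0, 0.1
NOISE_REDUCTION, EPISODES_BEFORE_TRAINING = 0.99, 300

def noise_scale_at(i_episode):
    if i_episode <= EPISODES_BEFORE_TRAINING:
        return NOISE_START
    return max(NOISE_END, NOISE_REDUCTION ** (i_episode - EPISODES_BEFORE_TRAINING))

print([round(noise_scale_at(e), 3) for e in (1, 300, 350, 500, 1000)])
# -> [1.0, 1.0, 0.605, 0.134, 0.1]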
Exemplo n.º 16
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, num_agents=20):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            num_agents (int): number of agents in the environment
        """
        print("Running on: " + str(device))

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(seed)
        self.eps = EPS_START
        self.eps_decay = 0.0005
        # Actor network
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optim = optim.Adam(self.actor_local.parameters(),
                                      lr=LR_ACTOR)

        # Critic network
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optim = optim.Adam(self.critic_local.parameters(),
                                       lr=LR_CRITIC)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        self.noise = OUNoise((num_agents, action_size), seed)

    def step(self, state, action, reward, next_state, done, agent_id):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        self.t_step += 1
        # Learn every UPDATE_EVERY time steps.
        if (self.t_step % UPDATE_EVERY) == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                for _ in range(LEARN_NUM):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA, agent_id)

    def act(self, states, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))

        self.actor_local.eval()
        with torch.no_grad():
            for i, state in enumerate(states):
                actions[i, :] = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            actions += self.eps * self.noise.sample()
        return np.clip(actions, -1, 1)

    def learn(self, experiences, gamma, agent_id):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ------------------- update critic network ------------------- #
        target_actions = self.actor_target.forward(next_states)
        # Construct next actions vector relative to the agent
        if agent_id == 0:
            target_actions = torch.cat((target_actions, actions[:, 2:]), dim=1)
        else:
            target_actions = torch.cat((actions[:, :2], target_actions), dim=1)

        next_critic_value = self.critic_target.forward(next_states,
                                                       target_actions)
        critic_value = self.critic_local.forward(states, actions)
        # Q targets for current state
        # If the episode is over, the reward from the future state will not be incorporated
        Q_targets = rewards + (gamma * next_critic_value * (1 - dones))

        critic_loss = F.mse_loss(critic_value, Q_targets)
        # Minimizing loss
        self.critic_local.train()
        self.critic_optim.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optim.step()

        self.critic_local.eval()

        # ------------------- update actor network ------------------- #
        self.actor_local.train()
        self.actor_optim.zero_grad()
        mu = self.actor_local.forward(states)
        # Construct mu vector relative to each agent
        if agent_id == 0:
            mu = torch.cat((mu, actions[:, 2:]), dim=1)
        else:
            mu = torch.cat((actions[:, :2], mu), dim=1)

        actor_loss = -self.critic_local(states, mu).mean()
        actor_loss.backward()
        self.actor_optim.step()

        self.actor_local.eval()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # update noise decay parameter
        self.eps -= self.eps_decay
        self.eps = max(self.eps, EPS_FINAL)
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def reset(self):
        self.noise.reset()
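
# Standalone check (not from the original example) of the soft-update rule
# θ_target = τ*θ_local + (1 - τ)*θ_target used by soft_update() above,
# applied to two small throwaway linear layers.
import copy
import torch
import torch.nn as nn

tau = 0.1
local = nn.Linear(4, 2)
target = nn.Linear(4, 2)
before = copy.deepcopy(target)

for target_param, local_param in zip(target.parameters(), local.parameters()):
    target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

# every target parameter moved a fraction tau of the way toward the local parameter
for t, b, l in zip(target.parameters(), before.parameters(), local.parameters()):
    assert torch.allclose(t.data, tau * l.data + (1 - tau) * b.data)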
Exemplo n.º 17
File: ddpg.py  Project: YuanyeMa/RL
class DDPGAgent:
    def __init__(self,
                 plot=True,
                 seed=1,
                 env: gym.Env = None,
                 batch_size=128,
                 learning_rate_actor=0.001,
                 learning_rate_critic=0.001,
                 weight_decay=0.01,
                 gamma=0.999):

        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)

        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.batch_size = batch_size
        self.learning_rate_actor = learning_rate_actor
        self.learning_rate_critic = learning_rate_critic
        self.weight_decay = weight_decay
        self.gamma = gamma
        self.tau = 0.001

        self._to_tensor = util.to_tensor
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')

        self.actor = Actor(self.state_dim, self.action_dim).to(self.device)
        self.target_actor = Actor(self.state_dim,
                                  self.action_dim).to(self.device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                self.learning_rate_actor,
                                                weight_decay=self.weight_decay)

        self.critic = Critic(self.state_dim, self.action_dim).to(self.device)
        self.target_critic = Critic(self.state_dim,
                                    self.action_dim).to(self.device)
        self.critic_optimizer = torch.optim.Adam(
            self.critic.parameters(),
            self.learning_rate_critic,
            weight_decay=self.weight_decay)

        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)
        self.t = 0

    def _learn_from_memory(self, memory):
        '''Learn from memory and update the parameters of both networks.'''
        # Randomly sample transitions from memory
        trans_pieces = memory.sample(self.batch_size)
        s0 = np.vstack([x.state for x in trans_pieces])
        a0 = np.vstack([x.action for x in trans_pieces])
        r1 = np.vstack([x.reward for x in trans_pieces])
        s1 = np.vstack([x.next_state for x in trans_pieces])
        terminal_batch = np.vstack([x.is_done for x in trans_pieces])

        # Optimize the critic network parameters
        s1 = self._to_tensor(s1, device=self.device)
        s0 = self._to_tensor(s0, device=self.device)

        next_q_values = self.target_critic.forward(
            state=s1, action=self.target_actor.forward(s1)).detach()
        # Bootstrap only for non-terminal transitions: mask the next-state value with (1 - done)
        target_q_batch = self._to_tensor(r1, device=self.device) + \
            self.gamma * self._to_tensor((1 - terminal_batch).astype(np.float32), device=self.device) * next_q_values
        q_batch = self.critic.forward(s0,
                                      self._to_tensor(a0, device=self.device))

        # Compute the critic loss and update the critic network parameters
        loss_critic = F.mse_loss(q_batch, target_q_batch)
        #self.critic_optimizer.zero_grad()
        self.critic.zero_grad()
        loss_critic.backward()
        self.critic_optimizer.step()

        # Update the actor: use the critic's value estimate of the chosen action as the policy objective
        loss_actor = -self.critic.forward(s0, self.actor.forward(s0))  # gradient ascent on Q
        loss_actor = loss_actor.mean()
        self.actor.zero_grad()
        #self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()

        # Soft-update the target network parameters
        soft_update(self.target_actor, self.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)
        return (loss_critic.item(), loss_actor.item())

    def learning(self, memory):
        self.actor.train()
        return self._learn_from_memory(memory)

    def save_models(self, episode_count):
        torch.save(self.target_actor.state_dict(),
                   './Models/' + str(episode_count) + '_actor.pt')
        torch.save(self.target_critic.state_dict(),
                   './Models/' + str(episode_count) + '_critic.pt')

    def load_models(self, episode):
        self.actor.load_state_dict(
            torch.load('./Models/' + str(episode) + '_actor.pt'))
        self.critic.load_state_dict(
            torch.load('./Models/' + str(episode) + '_critic.pt'))
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)
        print('Models loaded successfully')
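
# Quick numeric check (not from the original example) of the bootstrap mask used
# above: at terminal transitions (is_done == 1) the target collapses to the
# immediate reward.
import numpy as np

gamma = 0.99
r1 = np.array([[1.0], [2.0]])
next_q = np.array([[10.0], [10.0]])
done = np.array([[0.0], [1.0]])

target_q = r1 + gamma * (1 - done) * next_q
print(target_q)  # rows: [10.9] and [2.0]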
Exemplo n.º 18
class DDPG(object):
    """
    Interacts with and learns from the environment.

    There are two agents and the observations of each agent has 24 dimensions, while each agent's action has 2 dimensions.
    Here we use two separate actor networks (one for each agent using each agent's observations only and output that agent's action).
    The critic for each agents gets to see the full observations and full actions of all agents.
    """
    def __init__(self,
                 agent_id,
                 state_size,
                 full_state_size,
                 action_size,
                 full_action_size,
                 actor_hidden_sizes=(256, 128),
                 actor_lr=1e-4,
                 actor_weight_decay=0.,
                 critic_hidden_sizes=(256, 128),
                 critic_lr=1e-3,
                 critic_weight_decay=0.,
                 is_action_continuous=True):
        """
        Initialize an Agent object.

        :param agent_id (int): ID of the agent.
        :param state_size (int): Dimension of each state for each agent.
        :param full_state_size (int): Dimension of full state for all agents.
        :param action_size (int): Dimension of each action for each agent.
        :param full_action_size: Dimension of full action for all agents.
        :param actor_hidden_sizes (tuple): Hidden units of the actor network.
        :param actor_lr (float): Learning rate of the actor network.
        :param actor_weight_decay (float): weight decay (L2 penalty) of the actor network.
        :param critic_hidden_sizes (tuple): Hidden units of the critic network.
        :param critic_lr (float): Learning rate of the critic network.
        :param critic_weight_decay (float): weight decay (L2 penalty) of the critic network.
        :param is_action_continuous (bool): Whether action space is continuous or discrete.
        """
        self.id = agent_id
        self.state_size = state_size
        self.full_state_size = full_state_size
        self.action_size = action_size
        self.full_action_size = full_action_size
        self.is_action_continuous = is_action_continuous

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(
            state_size,
            actor_hidden_sizes,
            action_size,
            out_gate=nn.Tanh if is_action_continuous else None)
        self.actor_target = Actor(
            state_size,
            actor_hidden_sizes,
            action_size,
            out_gate=nn.Tanh if is_action_continuous else None)
        self.update(self.actor_local, self.actor_target, 1.)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=actor_lr,
                                          weight_decay=actor_weight_decay)

        # Critic Network (w/ Target Network)
        num_agents = int(full_action_size / action_size)
        self.critic_local = Critic(
            full_state_size,
            full_action_size if is_action_continuous else num_agents,
            critic_hidden_sizes)
        self.critic_target = Critic(
            full_state_size,
            full_action_size if is_action_continuous else num_agents,
            critic_hidden_sizes)
        # self.critic_local, self.critic_target = get_critic(full_state_size, full_action_size, critic_hidden_sizes)
        self.update(self.critic_local, self.critic_target, 1.)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=critic_lr,
                                           weight_decay=critic_weight_decay)

        self.use_actor = True

        # Noise Process
        self.noise_scale = 0.
        self.noise = OUNoise(action_size)

    def reset(self):
        self.noise.reset()

    def act(self, state, noise_scale=0.0):
        """
        Returns action for given state using current policy.
        """
        states = torch.from_numpy(state[np.newaxis]).float()

        # calculate actions
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states)
        self.actor_local.train()
        actions = actions.cpu().numpy().squeeze()

        # add noise
        actions += noise_scale * self.noise.sample()

        return np.clip(actions, -1,
                       1) if self.is_action_continuous else np.argmax(actions)

    def learn(self,
              states,
              actions,
              rewards,
              next_states,
              dones,
              full_actions_predicted,
              critic_full_next_actions,
              gamma=0.99):
        """
        Update policy and value parameters.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        :param states: Full states for training, with size (BATCHES, NUM_AGENTS, STATE_SIZE)
        :param actions: Full actions for training, with size (BATCHES, NUM_AGENTS, ACTION_SIZE)
        :param rewards: Full rewards for training, with size (BATCHES, NUM_AGENTS)
        :param next_states: Full next states for training, with size (BATCHES, NUM_AGENTS, STATE_SIZE)
        :param dones: Full dones for training, with size (BATCHES, NUM_AGENTS)
        :param full_actions_predicted: Per-agent actions predicted by the local actors, each of size (BATCHES, ACTION_SIZE)
        :param critic_full_next_actions: Per-agent next actions from the target actors, concatenated to size (BATCHES, NUM_AGENTS * ACTION_SIZE)
        :param gamma: discount factor
        """
        full_states = states.view(-1, self.full_state_size)
        full_actions = actions.view(states.shape[0], -1).float()
        full_next_states = next_states.view(-1, self.full_state_size)
        critic_full_next_actions = torch.cat(critic_full_next_actions,
                                             dim=1).float().to(DEVICE)

        actor_rewards = rewards[:, self.id].view(-1, 1)
        actor_dones = dones[:, self.id].view(-1, 1)

        # ---------------------------- update critic ---------------------------- #
        q_next = self.critic_target.forward(full_next_states,
                                            critic_full_next_actions)

        q_target = actor_rewards + gamma * q_next * (1 - actor_dones)

        q_expected = self.critic_local(full_states, full_actions)

        # Compute critic loss
        critic_loss = F.mse_loss(q_expected, q_target.detach())

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        if self.use_actor:
            # detach actions from other agents
            full_actions_predicted = [
                actions if i == self.id else actions.detach()
                for i, actions in enumerate(full_actions_predicted)
            ]
            full_actions_predicted = torch.cat(full_actions_predicted,
                                               dim=1).float().to(DEVICE)

            # Compute actor loss
            actor_loss = -self.critic_local.forward(
                full_states, full_actions_predicted).mean()

            # Minimize the loss
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()
        else:
            actor_loss = torch.tensor(0)

        return actor_loss.cpu().item(), critic_loss.cpu().item()

    def update(self, source, target, tau=0.01):
        """
        Update target model parameters:
        θ_target = τ*θ_local + (1 - τ)*θ_target

        :param source: Pytorch model which parameters are copied from
        :param target: Pytorch model which parameters are copied to
        :param tau: interpolation parameter
        """
        for param, target_param in zip(source.parameters(),
                                       target.parameters()):
            target_param.data.copy_(target_param.data * (1 - tau) +
                                    param.data * tau)
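
# Hypothetical sketch (not from the original example) of how a MADDPG-style
# wrapper could assemble the full_actions_predicted and critic_full_next_actions
# arguments for learn() above; the surrounding agent/buffer plumbing is assumed.
def maddpg_learn_step(agents, states, actions, rewards, next_states, dones, gamma=0.99):
    # states / next_states: tensors of shape (BATCHES, NUM_AGENTS, STATE_SIZE)
    critic_full_next_actions = [
        agent.actor_target(next_states[:, i, :]) for i, agent in enumerate(agents)
    ]
    losses = []
    for agent in agents:
        full_actions_predicted = [
            a.actor_local(states[:, i, :]) for i, a in enumerate(agents)
        ]
        losses.append(agent.learn(states, actions, rewards, next_states, dones,
                                  full_actions_predicted, critic_full_next_actions,
                                  gamma))
    return losses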