Example #1
class MultiAgent(object):
    def __init__(self, config: DefaultMunch):
        self.config = config
        self.memory = self.config.memory
        self.n_agents = self.config.n_agents
        self.action_size = self.config.action_size
        self.state_size = self.config.state_size
        self.critic_local = Critic(self.state_size, self.config.action_size,
                                   self.config.n_agents).to(self.config.device)
        self.critic_target = Critic(self.state_size, self.config.action_size,
                                    self.config.n_agents).to(
                                        self.config.device)
        self.critic_target.load_state_dict(self.critic_local.state_dict())
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.config.lr_critic)
        self.agents = [Agent(self.config, self) for i in range(self.n_agents)]

    def step(self, states, actions, rewards, next_states, dones):
        self.memory.add((states[0], actions[0], rewards[0], next_states[0],
                         dones[0], states[1], actions[1], next_states[1]))
        self.agents[0].step()
        self.memory.add((states[1], actions[1], rewards[1], next_states[1],
                         dones[1], states[0], actions[0], next_states[0]))
        self.agents[1].step()

    def act(self, states, add_noise=True):
        actions1: torch.Tensor = self.agents[0].act(states[0], add_noise)
        actions2: torch.Tensor = self.agents[1].act(states[1], add_noise)
        actions = torch.stack([actions1, actions2], dim=0)
        return actions

    def reset(self):
        for agent in self.agents:
            agent.reset()

    def save(self, path, episode):
        for i, agent in enumerate(self.agents):
            agent.save(path + str(i), episode)

    def load(self, path):
        for i, agent in enumerate(self.agents):
            agent.load(path + str(i))
Example #2
class DDPGAgent:
    def __init__(self, state_size, action_size, num_agents,
                 hidden_actor, hidden_critic, lr_actor, lr_critic,
                 buffer_size, agent_id, use_PER=False, seed=0) -> None:
        super(DDPGAgent, self).__init__()

        self.seed = torch.manual_seed(seed)

        self.actor_local = Actor(state_size, hidden_actor, action_size,
                                 seed=seed).to(device)
        self.actor_target = Actor(state_size, hidden_actor, action_size,
                                  seed=seed).to(device)
        self.critic_local = Critic(state_size, num_agents*action_size,
                                  hidden_critic, 1, seed=seed).to(device)
        self.critic_target = Critic(state_size, num_agents*action_size,
                                    hidden_critic, 1, seed=seed).to(device)
        self.actor_optimizer = Adam(self.actor_local.parameters(),
                                    lr=lr_actor)
        self.critic_optimizer = Adam(self.critic_local.parameters(),
                                     lr=lr_critic)
        
        # exploration noise (the original passed an undefined out_actor; the actor output size is action_size)
        self.noise = OUNoise(action_size, scale=1.0)

        # initialize targets same as original networks
        hard_update(self.actor_target, self.actor_local)
        hard_update(self.critic_target, self.critic_local)


    def act(self, obs, noise):
        obs = obs.to(device)

        if len(obs.shape)==1: 
            obs = obs.unsqueeze(0)

        return self.actor_local(obs) + noise*self.noise.noise()

    def target_act(self, obs, noise):
        obs = obs.to(device)

        if len(obs.shape)==1:
            obs = obs.unsqueeze(0)
        
        return self.actor_target(obs) + noise*self.noise.noise()
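The hard_update helper called above is not included in this example; under the assumption that it simply copies every parameter from the source network into the target network, a minimal sketch would be:

import torch

def hard_update(target, source):
    # Copy each source parameter into the matching target parameter in place.
    with torch.no_grad():
        for target_param, source_param in zip(target.parameters(),
                                              source.parameters()):
            target_param.copy_(source_param)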
Example #3
class DDPGAgent():
    """
    Agent that interacts with and learns from the environment.
    
    """
    def __init__(self, state_size, action_size, agent_num, random_seed):
        """
        Initialize an Agent object.
        :param state_size (int): dimension of each state
        :param action_size (int): dimension of each action
        :param agent_num (int): number of agents
        :param random_seed (int): random seed
        """

        # Actor Networks
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Networks
        self.critic_local = Critic(state_size, action_size, agent_num,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, agent_num,
                                    random_seed).to(device)
        self.critic_optimizer = Adam(self.critic_local.parameters(),
                                     lr=LR_CRITIC,
                                     weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed, scale=0.1)

    def act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.actor_local(obs) + noise * self.noise.sample()
        return action

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.actor_target(obs) + noise * self.noise.sample()
        return action
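OUNoise is defined elsewhere; a minimal Ornstein-Uhlenbeck process matching the constructor and sample() calls used here (the mu, theta and sigma values are assumed defaults, not taken from this example) could look like:

import numpy as np
import torch

class OUNoise:
    def __init__(self, size, seed, scale=0.1, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.scale = scale
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        # Restart the process from its long-run mean.
        self.state = self.mu.copy()

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1); drift the internal state and return it scaled.
        dx = self.theta * (self.mu - self.state) + \
            self.sigma * self.rng.standard_normal(len(self.state))
        self.state = self.state + dx
        return torch.from_numpy(self.scale * self.state).float()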
Example #4
class DDPGAgent:
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 lr_actor=1.0e-4,
                 lr_critic=1.0e-3):
        super(DDPGAgent, self).__init__()

        self.actor = Actor(state_size, action_size).to(DEVICE)
        self.critic = Critic(state_size, action_size, num_agents).to(DEVICE)
        self.target_actor = Actor(state_size, action_size).to(DEVICE)
        self.target_critic = Critic(state_size, action_size,
                                    num_agents).to(DEVICE)

        self.noise = OUNoise(action_size, scale=1.0)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic)

    def act(self, states, noise=0.0):
        states = states.to(DEVICE)
        self.actor.eval()
        actions = self.actor(
            states).cpu().data.numpy() + noise * self.noise.noise()
        return np.clip(actions, -1, 1)

    def target_act(self, states, noise=0.0):
        states = states.to(DEVICE)
        self.target_actor.eval()
        actions = self.target_actor(
            states).cpu().data.numpy() + noise * self.noise.noise()
        return np.clip(actions, -1, 1)
Example #5
class Agent():
    def __init__(self, state_shape, action_shape, stats):
        # self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.device = torch.device("cpu")

        self.state_shape = state_shape
        self.action_shape = action_shape

        self.stats = stats

        self.learn_rate = 3e-4
        self.num_epochs = 8

        self.entropy_weight = 0.001
        self.kl_clip = 0.1

        self.deterministic_test_mode = False

        self.hidden_state_size = 16
        self.lstm = LSTM(self.state_shape, self.hidden_state_size)
        self.actor = Actor(self.hidden_state_size,
                           self.action_shape).to(self.device)
        self.critic = Critic(self.hidden_state_size).to(self.device)

        self.optimizer = torch.optim.Adam(list(self.actor.parameters()) +
                                          list(self.critic.parameters()),
                                          lr=self.learn_rate)

    def act(self, state):
        with torch.no_grad():
            if self.deterministic_test_mode:
                mu = self.actor.forward_deterministic(state)
                action = mu
            else:
                policy_dist = self.actor(state)
                action = policy_dist.sample()
            action = action.clamp(-1, 1)  #   depends on env
            action = action.cpu().numpy()[0]
            return action

    def learn(self, rollout_collector):
        for _ in range(self.num_epochs):
            for state, action, old_log_probs, advantage, return_ in \
                    rollout_collector.random_batch_iter():
                policy_dist = self.actor(state)
                value = self.critic(state)
                new_log_probs = policy_dist.log_prob(action)

                ratio = (new_log_probs - old_log_probs).exp()
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1.0 - self.kl_clip,
                                    1.0 + self.kl_clip) * advantage

                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = (return_ - value).pow(2).mean()

                entropy = policy_dist.entropy().mean()
                loss = 0.5 * critic_loss + actor_loss - self.entropy_weight * entropy

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

        self.stats.update_training_stats(
            num_samples_processed_inc=\
                rollout_collector.batch_size * rollout_collector.rollout_length * self.num_epochs)
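The core of learn() is the PPO clipped-surrogate objective; the same computation is repeated below as a standalone sketch with a toy sanity check (the function and variable names are illustrative only):

import torch

def ppo_clipped_surrogate(new_log_probs, old_log_probs, advantage, clip_eps=0.1):
    # Probability ratio between the new and the old policy.
    ratio = (new_log_probs - old_log_probs).exp()
    surr1 = ratio * advantage
    surr2 = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantage
    # Negated because the optimizer minimizes the loss.
    return -torch.min(surr1, surr2).mean()

# With identical old/new log-probs the ratio is 1, so the loss is just -mean(advantage).
adv = torch.tensor([1.0, -0.5])
logp = torch.tensor([-1.0, -2.0])
assert torch.isclose(ppo_clipped_surrogate(logp, logp, adv), -adv.mean())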
Example #6
class Agent:
    """

    ---
    # test scenario 1.
    >>> env = environment.gym_env(ID)
    >>> agent = Agent(env)
    >>> state = agent.env.reset()

    # >>> agent.select_action(state)

    >>> agent.train(100000)
    """
    def __init__(self, env: gym.Env, hidden_dims=128):
        self.env = env
        obs_dim = env.observation_space.shape[0]  ## TODO: memorize this
        action_dim = env.action_space.shape[0]

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.gamma = 0.95  # hyperparameter
        self.entropy_weight = 1e-2  # hyperparameter

        self.actor = Actor(obs_dim, action_dim, hidden_dims).to(self.device)
        self.critic = Critic(obs_dim, hidden_dims).to(self.device)  # pass this in

        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-4)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)

        self.done = False
        self.score = 0

        self.transition_store = list()

    def select_action(self, state: np.ndarray, train='train'):
        """
        This part is a bit convoluted
        :param state:
        :param train:
        :return:
        """
        state_tensor = torch.FloatTensor(state).to(self.device)  # memorize this kind of conversion
        action, dist = self.actor(state_tensor)
        action_map = {'train': action, 'test': dist.mean}
        selected_action = action_map[train]
        log_prob = dist.log_prob(selected_action).sum(
            dim=-1)  # entropy ### why do we sum here?

        return selected_action.clamp(-2.0, 2.0).cpu().detach().numpy(), log_prob
        # e.g. array([-2.], dtype=float32)  # why is detach needed here?
        # selected_action.clamp(-2.0, 2.0).cpu().detach().numpy(),
        # dist.log_prob(action_map)

    def train(
        self, number_frames
    ):  # number_frames = 500000 (this is the total number of frames), plotting interval = 100

        state = self.env.reset()
        for i in range(1, number_frames):
            self.env.render()
            action, log_prob = self.select_action(state, 'train')
            next_state, reward, done, info = self.env.step(action)
            self.transition_store.append((state, next_state, reward, done))
            state = next_state
            if done:
                # is this effectively plain SGD?
                self.update(self.transition_store)
                self.transition_store = []
                state = self.env.reset()

        self.env.close()

    def update(self, store):
        for experience in store:
            state, next_state, reward, done = experience
            next_state = torch.FloatTensor(next_state).to(self.device)
            state = torch.FloatTensor(state).to(self.device)

            pred_value = self.critic(state)
            targ_value = reward + self.gamma * self.critic(next_state) * (1 -
                                                                          done)
            value_loss = F.smooth_l1_loss(pred_value, targ_value.detach())

            # update value
            self.critic_optimizer.zero_grad()
            value_loss.backward()
            self.critic_optimizer.step()

            # advantage = Q_t - V(s_t)
            _, log_prob = self.select_action(state, 'train')
            advantage = (targ_value -
                         pred_value).detach()  # not backpropagated
            policy_loss = -advantage * log_prob
            policy_loss += self.entropy_weight * -log_prob  # entropy maximization

            # update policy
            self.actor_optimizer.zero_grad()
            policy_loss.backward()
            self.actor_optimizer.step()

        # return policy_loss.item(), value_loss.item()

    def log(self):  # score, actor loss, critic loss to tensorboard
        raise NotImplementedError
        # write logs to tensorboard

    def check(self):
        raise NotImplementedError
        # save checkpoints periodically?

    def test(self):
        raise NotImplementedError
Example #7
from collections import deque

import torch
from torch import optim
from tqdm import tqdm
from hyperparams import OFF_POLICY_BATCH_SIZE as BATCH_SIZE, DISCOUNT, ENTROPY_WEIGHT, HIDDEN_SIZE, LEARNING_RATE, MAX_STEPS, POLYAK_FACTOR, REPLAY_SIZE, TEST_INTERVAL, UPDATE_INTERVAL, UPDATE_START
from env import Env
from models import Critic, SoftActor, create_target_network, update_target_network
from utils import plot

env = Env()
actor = SoftActor(HIDDEN_SIZE)
critic_1 = Critic(HIDDEN_SIZE, state_action=True)
critic_2 = Critic(HIDDEN_SIZE, state_action=True)
value_critic = Critic(HIDDEN_SIZE)
target_value_critic = create_target_network(value_critic)
actor_optimiser = optim.Adam(actor.parameters(), lr=LEARNING_RATE)
critics_optimiser = optim.Adam(list(critic_1.parameters()) +
                               list(critic_2.parameters()),
                               lr=LEARNING_RATE)
value_critic_optimiser = optim.Adam(value_critic.parameters(),
                                    lr=LEARNING_RATE)
D = deque(maxlen=REPLAY_SIZE)


def test(actor):
    with torch.no_grad():
        env = Env()
        state, done, total_reward = env.reset(), False, 0
        while not done:
            action = actor(
                state).mean  # Use purely exploitative policy at test time
            state, reward, done = env.step(action)
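create_target_network and update_target_network come from the accompanying models module, which is not shown; a plausible minimal version, assuming the usual frozen deep copy plus polyak averaging, is:

import copy
import torch

def create_target_network(network):
    # Frozen deep copy that only tracks the online network through explicit updates.
    target = copy.deepcopy(network)
    for param in target.parameters():
        param.requires_grad = False
    return target

def update_target_network(network, target_network, polyak_factor):
    # target <- polyak * target + (1 - polyak) * online, applied parameter-wise.
    with torch.no_grad():
        for param, target_param in zip(network.parameters(),
                                       target_network.parameters()):
            target_param.mul_(polyak_factor).add_((1 - polyak_factor) * param)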
Example #8
class DDPG(object):
    def __init__(self,
                 n_s,
                 n_a,
                 a_bound,
                 gamma=0.99,
                 memory_size=10000,
                 tau=0.01,
                 lr_a=0.001,
                 lr_c=0.002,
                 batch_size=64,
                 var=3,
                 var_decay=0.9995):
        self.n_s = n_s
        self.n_a = n_a
        self.a_bound = a_bound
        self.gamma = gamma
        self.memory_size = memory_size
        self.tau = tau
        self.batch_size = batch_size
        self.var = var
        self.var_decay = var_decay

        # memory
        self.replay_buffer = ReplayBuffer(n_s, n_a, memory_size)
        # actor
        self.eval_actor = Actor(n_s, n_a, a_bound)
        self.target_actor = deepcopy(self.eval_actor)
        self.actor_optim = torch.optim.Adam(self.eval_actor.parameters(),
                                            lr=lr_a)

        # critic
        self.eval_critic = Critic(n_s, n_a)
        self.target_critic = deepcopy(self.eval_critic)
        self.critic_optim = torch.optim.Adam(self.eval_critic.parameters(),
                                             lr=lr_c)

    def choose_action(self, s):
        s = torch.FloatTensor(s).unsqueeze(0)
        action = self.eval_actor(s).detach().numpy()[0]
        a = np.clip(np.random.normal(action, self.var), -self.a_bound,
                    self.a_bound)
        return a

    def step(self, s, a, r, s_, done):
        self.store(s, a, r, s_, done)
        if self.replay_buffer.memory_count < self.memory_size:
            return
        # start learn
        self._learn()

    def _learn(self):
        # get batch
        mini_batch = self.replay_buffer.sample(self.batch_size)
        # NOTE: slices below assume each buffer row is laid out as [s, a, r, done, s_]
        b_s = torch.FloatTensor(mini_batch[:, :self.n_s])
        b_a = torch.FloatTensor(mini_batch[:, self.n_s:self.n_s + self.n_a])
        b_r = torch.FloatTensor(mini_batch[:, self.n_s + self.n_a:self.n_s +
                                           self.n_a + 1])
        b_s_ = torch.FloatTensor(mini_batch[:, -self.n_s:])
        b_done = torch.FloatTensor(mini_batch[:, self.n_s + self.n_a +
                                              1:self.n_s + self.n_a + 2])
        # learn
        self.update_critic(b_s, b_a, b_r, b_s_, b_done)
        self.update_actor(b_s)
        self.var *= self.var_decay

    def update_critic(self, s, a, r, s_, done):
        with torch.no_grad():
            target_next_a = self.target_actor(s_)
            next_q = self.target_critic(s_, target_next_a)
            target_q = r + self.gamma * next_q * (1.0 - done)
        eval_q = self.eval_critic(s, a)
        critic_loss = F.mse_loss(eval_q, target_q)
        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()
        self._soft_update(self.eval_critic, self.target_critic)

    def update_actor(self, s):
        action = self.eval_actor(s)
        actor_loss = -self.eval_critic(s, action).mean()
        self.actor_optim.zero_grad()
        actor_loss.backward()
        self.actor_optim.step()
        self._soft_update(self.eval_actor, self.target_actor)

    def store(self, s, a, r, s_, done):
        self.replay_buffer.store(s, a, r, s_, done)

    def _soft_update(self, eval_net, target_net):
        for eval_param, target_param in zip(eval_net.parameters(),
                                            target_net.parameters()):
            target_param.data.copy_(self.tau * eval_param.data +
                                    (1.0 - self.tau) * target_param.data)

    # save all net
    def save(self, name):
        torch.save(self.eval_actor, '{}_actor.pt'.format(name))
        torch.save(self.eval_critic, '{}_critic.pt'.format(name))

    # load all net
    def load(self, name):
        actor = torch.load('{}_actor.pt'.format(name))
        critic = torch.load('{}_critic.pt'.format(name))
        return actor, critic
Example #9
def main():
    order_book_id_number = 10
    toy_data = create_toy_data(order_book_ids_number=order_book_id_number,
                               feature_number=20,
                               start="2019-05-01",
                               end="2019-12-12",
                               frequency="D")
    env = PortfolioTradingGym(data_df=toy_data,
                              sequence_window=5,
                              add_cash=True)
    env = Numpy(env)
    env = ch.envs.Logger(env, interval=1000)
    env = ch.envs.Torch(env)
    env = ch.envs.Runner(env)

    # create net
    action_size = env.action_space.shape[0]
    number_asset, seq_window, features_number = env.observation_space.shape

    input_size = features_number

    actor = Actor(input_size=input_size,
                  hidden_size=50,
                  action_size=action_size)
    critic = Critic(input_size=input_size,
                    hidden_size=50,
                    action_size=action_size)

    target_actor = create_target_network(actor)
    target_critic = create_target_network(critic)
    actor_optimiser = optim.Adam(actor.parameters(), lr=LEARNING_RATE_ACTOR)
    critic_optimiser = optim.Adam(critic.parameters(), lr=LEARNING_RATE_CRITIC)
    replay = ch.ExperienceReplay()
    ou_noise = OrnsteinUhlenbeckNoise(mu=np.zeros(action_size))

    def get_action(state):
        action = actor(state)
        action = action + ou_noise()[0]
        return action

    def get_random_action(state):
        action = torch.softmax(torch.randn(action_size), dim=0)
        return action

    for step in range(1, MAX_STEPS + 1):
        with torch.no_grad():

            if step < UPDATE_START:
                replay += env.run(get_random_action, steps=1)
            else:
                replay += env.run(get_action, steps=1)

        replay = replay[-REPLAY_SIZE:]
        if step > UPDATE_START and step % UPDATE_INTERVAL == 0:
            sample = random.sample(replay, BATCH_SIZE)
            batch = ch.ExperienceReplay(sample)

            next_values = target_critic(batch.next_state(),
                                        target_actor(batch.next_state())).view(
                                            -1, 1)
            values = critic(batch.state(), batch.action()).view(-1, 1)
            rewards = ch.normalize(batch.reward())
            #rewards = batch.reward()/100.0   # changes convergence a lot
            value_loss = ch.algorithms.ddpg.state_value_loss(
                values, next_values.detach(), rewards, batch.done(), DISCOUNT)
            critic_optimiser.zero_grad()
            value_loss.backward()
            critic_optimiser.step()

            # Update policy by one step of gradient ascent
            policy_loss = -critic(batch.state(), actor(batch.state())).mean()
            actor_optimiser.zero_grad()
            policy_loss.backward()
            actor_optimiser.step()

            # Update target networks
            ch.models.polyak_average(target_critic, critic, POLYAK_FACTOR)
            ch.models.polyak_average(target_actor, actor, POLYAK_FACTOR)
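The commented-out line above notes that how rewards are scaled changes convergence a lot; ch.normalize standardizes the batch of rewards, which conceptually amounts to the following generic sketch (not cherry's exact implementation):

import torch

def normalize_rewards(rewards, eps=1e-8):
    # Zero-mean, unit-variance rewards for a more stable critic target.
    return (rewards - rewards.mean()) / (rewards.std() + eps)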
Example #10
class DDPG:
    """Implementation of DDPG.

    This implementation is adapted to this particular environment running several agent.
    At each time step, the same actor is controlling each agent sequentially.
    """

    def __init__(self, state_size, action_size, config):
        """Initialize algorithm."""
        if config.PER:
            self.memory = PrioritizeReplayBuffer(
                config.BUFFER_SIZE, config.BATCH_SIZE, config.SEED
            )
        else:
            self.memory = ReplayBuffer(
                config.BUFFER_SIZE, config.BATCH_SIZE, config.SEED
            )

        # Randomly initialize critic network and actor
        self.actor = Actor(state_size, action_size, config.SEED).to(device)
        self.critic = Critic(state_size, action_size, config.SEED).to(device)

        # Initialize target networks with weights from actor critic
        # Actor
        self.actor_target = Actor(state_size, action_size, config.SEED).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        # Critic
        self.critic_target = Critic(state_size, action_size, config.SEED).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())

        # Actor optimizer
        self.actor_optimizer = torch.optim.Adam(
            self.actor.parameters(), lr=config.LR_ACTOR
        )
        # Critic optimizer
        self.critic_optimizer = torch.optim.Adam(
            self.critic.parameters(), lr=config.LR_CRITIC
        )

        self.config = config

        self.t_step = 0

        self.expl_noise = config.EXPL_NOISE

    def step(self, target_sample=None, **kwargs):
        """Run a step of algorithm update."""
        # Sample a random minibatch of transitions
        states, actions, rewards, next_states, dones = self._draw_minibatch()

        # Compute the target Q value
        target_Q = self.critic_target(
            next_states, self.actor_target(next_states)
        ).detach()
        y = rewards + (1 - dones) * self.config.GAMMA * target_Q

        # Update critic by minimizing the loss
        current_Q = self.critic(states, actions)

        # Compute TD error
        td_error = y - current_Q

        if self.config.PER:
            # Get importance_sampling_weights
            weights = torch.Tensor(self.memory.importance_sampling()).unsqueeze(1)
            # Update priorities
            self.memory.update_priorities(td_error.detach().cpu().numpy())
            # Compute critic loss
            critic_loss = torch.mean(weights * td_error ** 2)
        else:
            # Compute critic loss
            critic_loss = torch.mean(td_error ** 2)

        # Optimize critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Clip gradient
        nn.utils.clip_grad_norm_(self.critic.parameters(), 1)
        self.critic_optimizer.step()

        # Update the actor policy using the sampled policy gradient:
        actor_loss = -self.critic(states, self.actor(states)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        # Clip gradient
        nn.utils.clip_grad_norm_(self.actor.parameters(), 1)
        self.actor_optimizer.step()

        # Update target networks
        self.soft_update()

    def train(self, env, num_episode):
        """Train a DDPG agent."""
        scores = []
        scores_window = deque(maxlen=100)

        for episode in range(num_episode):
            # Init state and episode score
            states = env.reset(train_mode=True)
            score = np.zeros(states.shape[0])
            done = False

            # Run episode
            while not done:
                # Select and run action
                actions = self.predict_actions(states)
                # TODO: dynamic low and high selection
                actions = self.add_gaussian_noise(actions, -1, 1)
                next_states, rewards, dones = env.step(actions)

                # Store all n_agent episodes in replay buffer
                for state, action, reward, next_state, done in zip(
                    states, actions, rewards, next_states, dones
                ):
                    self.memory.add(state, action, reward, next_state, done)

                # Update time step
                self.t_step = (self.t_step + 1) % self.config.UPDATE_EVERY

                # Optimisation step if UPDATE_EVERY and enough examples in memory
                if self.t_step == 0 and len(self.memory) > self.config.BATCH_SIZE:
                    for _ in range(self.config.UPDATE_STEPS):
                        self.step()

                # Update state and scores
                states = next_states
                score += rewards

                # End the episode if any of the agents is done, to avoid storing
                # too many done transitions in the replay buffer
                done = any(dones)

            # Keep track of running mean
            scores_window.append(max(score))

            # Append current mean to scores list
            scores.append(np.mean(scores_window))

            # Logging
            print(
                "\rEpisode {}\tAverage Score: {:.2f}, Last Score: {:.2f}".format(
                    episode, np.mean(scores_window), max(score)
                ),
                end="",
            )
            if (episode + 1) % 100 == 0:
                print(
                    "\rEpisode {}\tAverage Score: {:.2f}".format(
                        episode, np.mean(scores_window)
                    )
                )

        return scores

    def soft_update(self):
        """Update the frozen target models."""
        tau = self.config.TAU
        # Critic
        for param, target_param in zip(
            self.critic.parameters(), self.critic_target.parameters()
        ):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

        # Actor
        for param, target_param in zip(
            self.actor.parameters(), self.actor_target.parameters()
        ):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    def predict_actions(self, states, **kwargs):
        """Predict next actions based on current policy."""
        states = torch.from_numpy(states).float().unsqueeze(0).to(device)

        # Set actor to eval mode
        self.actor.eval()

        actions = []
        with torch.no_grad():
            for state in states:
                action = self.actor(state)
                actions.append(action.detach().numpy())

        # Set actor to train mode
        self.actor.train()

        return np.array(actions).squeeze()

    def add_gaussian_noise(self, action, low, high):
        """Add Gaussian noise to action, and clip between low and high."""
        return (action + np.random.normal(0, self.expl_noise, size=action.shape)).clip(
            low, high
        )

    def _draw_minibatch(self):
        """Draw a minibatch in the replay buffer."""
        states, actions, rewards, next_states, done = zip(*self.memory.sample())

        states = torch.Tensor(states).to(device)
        actions = torch.Tensor(actions).to(device)
        rewards = torch.Tensor(rewards).unsqueeze(1).to(device)
        next_states = torch.Tensor(next_states).to(device)
        done = torch.Tensor(done).unsqueeze(1).to(device)

        return states, actions, rewards, next_states, done

    def save_model(self, path, **kwargs):
        """Save actor model weights."""
        torch.save(self.actor.state_dict(), path)
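_draw_minibatch unzips whatever self.memory.sample() returns, so the (non-prioritized) buffer is assumed to yield a list of (state, action, reward, next_state, done) tuples; a minimal deque-based sketch with that interface:

import random
from collections import deque, namedtuple

Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])

class ReplayBuffer:
    def __init__(self, buffer_size, batch_size, seed):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        # List of experience tuples; the caller unzips them with zip(*...).
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)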
Example #11
config.device = device.type
print(device)

actor = Actor(obs_space=config.obs_space,
              action_space=config.action_space,
              hidden_size=config.hidden_size).to(device)
critic = Critic(obs_space=config.obs_space,
                hidden_size=config.hidden_size).to(device)
# actor.load_state_dict(torch.load('actor_model.h5'))
# critic.load_state_dict(torch.load('critic_model.h5'))

wandb.watch(actor)
wandb.watch(critic)

optimizer_actor = Adam(actor.parameters(), lr=config.actor_lr)
optimizer_critic = Adam(critic.parameters(), lr=config.critic_lr)
memory = Memory(env.agent_ids)


def compute_GAE(rewards, state_values, done, gamma, lamb):
    """
        Computes Generalized Advantage Estimations.
    """
    returns = [rewards[-1] + state_values[-1]]
    running_sum = rewards[-1] - state_values[-1]
    for i in reversed(range(len(rewards) - 1)):
        mask = 0 if done[i + 1] else 1
        delta = rewards[i] + gamma * state_values[i + 1] * mask - state_values[i]
        running_sum = delta + gamma * lamb * running_sum * mask
        returns.insert(0, running_sum + state_values[i])
    return returns
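A toy call illustrating the expected inputs (one reward, one value estimate and one done flag per step) and how advantages would be recovered from the returned values; this usage is an assumption based on the function body above:

rewards = [1.0, 1.0, 1.0]
state_values = [0.5, 0.4, 0.3]
done = [False, False, True]
returns = compute_GAE(rewards, state_values, done, gamma=0.99, lamb=0.95)
advantages = [ret - val for ret, val in zip(returns, state_values)]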
Example #12
class TD3Agent():
    def __init__(self, env: object, gamma: float, delay_step: int, tau: float,
                 buffer_maxlen: int, noise_std: float, noise_bound: float,
                 critic_lr: float, actor_lr: float):

        # Select the device to use: CUDA (GPU) if available, otherwise CPU
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        # Creating the Gym environments for training and evaluation
        self.env = env
        # Get max and min values of the action of this environment
        self.action_range = [
            self.env.action_space.low, self.env.action_space.high
        ]
        # Get the dimensions of the state and the action
        self.obs_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]

        # Total_step initialization
        self.steps = 0

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr
        self.buffer_maxlen = buffer_maxlen
        self.noise_std = noise_std
        self.noise_bound = noise_bound
        self.delay_step = delay_step

        # Scale and bias for the actions: each environment has different min and max action values
        self.scale = (self.action_range[1] - self.action_range[0]) / 2.0
        self.bias = (self.action_range[1] + self.action_range[0]) / 2.0

        # initialize networks
        self.critic1 = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.target_critic1 = Critic(self.obs_dim,
                                     self.action_dim).to(self.device)
        self.critic2 = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.target_critic2 = Critic(self.obs_dim,
                                     self.action_dim).to(self.device)
        self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)
        self.target_actor = Actor(self.obs_dim,
                                  self.action_dim).to(self.device)

        # copy weight parameters to the target Q network and actor network
        for target_param, param in zip(self.target_critic1.parameters(),
                                       self.critic1.parameters()):
            target_param.data.copy_(param)

        for target_param, param in zip(self.target_critic2.parameters(),
                                       self.critic2.parameters()):
            target_param.data.copy_(param)

        for target_param, param in zip(self.target_actor.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.critic1_optimizer = optim.Adam(self.critic1.parameters(),
                                            lr=self.critic_lr)
        self.critic2_optimizer = optim.Adam(self.critic2.parameters(),
                                            lr=self.critic_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.actor_lr)

        # Create a replay buffer
        self.replay_buffer = BasicBuffer(self.buffer_maxlen)

    def update(self, batch_size: int, steps: int):
        self.steps = steps

        # Sampling experiences from the replay buffer
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size)

        # Convert numpy arrays of experience tuples into pytorch tensors
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        # Critic update (computing the loss)
        # Sample actions for the next states (s_t+1) using the target actor
        next_actions = self.target_actor.forward(next_states)
        next_actions = self.rescale_action(next_actions)

        # Adding gaussian noise to the actions
        noise = self.get_noise(next_actions, self.noise_std + 0.1,
                               -self.noise_bound, self.noise_bound)
        noisy_next_actions = next_actions + noise

        # Compute Q(s_t+1,a_t+1)
        next_q1 = self.target_critic1(next_states, noisy_next_actions)
        next_q2 = self.target_critic2(next_states, noisy_next_actions)

        # Choose minimum Q
        min_q = torch.min(next_q1, next_q2)

        # Find expected Q, i.e., r(t) + gamma*next_q
        expected_q = rewards + (1 - dones) * self.gamma * min_q

        # Find current Q values for the given states and actions from replay buffer
        curr_q1 = self.critic1.forward(states, actions)
        curr_q2 = self.critic2.forward(states, actions)

        # Compute loss between Q network and expected Q
        critic1_loss = F.mse_loss(curr_q1, expected_q.detach())
        critic2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # Backpropagate the losses and update Q network parameters
        self.critic1_optimizer.zero_grad()
        critic1_loss.backward()
        self.critic1_optimizer.step()

        self.critic2_optimizer.zero_grad()
        critic2_loss.backward()
        self.critic2_optimizer.step()

        # actor update (computing the loss)

        if self.steps % self.delay_step == 0:
            # Sample new actions for the current states (s_t) using the current actor
            new_actions = self.actor.forward(states)

            # Compute Q(s_t,a_t)
            new_q1 = self.critic1.forward(states, new_actions)

            # Compute the actor loss, i.e., -Q1
            actor_loss = -new_q1.mean()

            # Backpropagate the losses and update actor network parameters
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the target networks
            for target_param, param in zip(self.target_critic1.parameters(),
                                           self.critic1.parameters()):
                target_param.data.copy_(self.tau * param +
                                        (1 - self.tau) * target_param)

            for target_param, param in zip(self.target_critic2.parameters(),
                                           self.critic2.parameters()):
                target_param.data.copy_(self.tau * param +
                                        (1 - self.tau) * target_param)

            for target_param, param in zip(self.target_actor.parameters(),
                                           self.actor.parameters()):
                target_param.data.copy_(self.tau * param +
                                        (1 - self.tau) * target_param)

    def get_noise(self, action: torch.Tensor, sigma: float, bottom: float,
                  top: float) -> torch.Tensor:
        # sigma: standard deviation of the noise
        # bottom, top: minimum and maximum values for the generated noise
        return torch.normal(torch.zeros(action.size()),
                            sigma).clamp(bottom, top).to(self.device)

    def get_action(self, state: np.ndarray, stochastic: bool) -> np.ndarray:
        # state: the state input to the pi network
        # stochastic: boolean (True -> use noisy action, False -> use noiseless,deterministic action)
        # Convert state numpy to tensor
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        action = self.actor.forward(state)

        if stochastic:
            # Add gaussian noise to the rescaled action
            action = self.rescale_action(action) + self.get_noise(
                action, self.noise_std, -self.noise_bound, self.noise_bound)
        else:
            action = self.rescale_action(action)

        # Convert action tensor to numpy
        action = action.squeeze(0).cpu().detach().numpy()
        return action

    def rescale_action(self, action: torch.Tensor) -> torch.Tensor:
        # we use a rescaled action since the output of the actor network is [-1,1] and the mujoco environments could be ranging from [-n,n] where n is an arbitrary real value
        # scale -> scalar multiplication
        # bias -> scalar offset
        return action * self.scale[0] + self.bias[0]

    def Actor_save(self, WORKSPACE: str):
        # save the model for each node
        print("Save the torch model")
        savePath = WORKSPACE + "./actor_model5_Hop_.pth"
        torch.save(self.actor.state_dict(), savePath)

    def Actor_load(self, WORKSPACE: str):
        # load the model for each node
        print("load the torch model")
        savePath = WORKSPACE + "./actor_model5_Hop_.pth"  # Best
        self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)
        self.actor.load_state_dict(torch.load(savePath))
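The heart of the critic update above is the clipped double-Q target; the same computation is shown below on toy tensors as a standalone sketch (the names here are illustrative, not part of the class):

import torch

def clipped_double_q_target(rewards, dones, next_q1, next_q2, gamma):
    # Element-wise minimum of the two target critics, then the Bellman target.
    min_q = torch.min(next_q1, next_q2)
    return rewards + (1 - dones) * gamma * min_q

rewards = torch.tensor([[1.0], [0.5]])
dones = torch.tensor([[0.0], [1.0]])
next_q1 = torch.tensor([[2.0], [3.0]])
next_q2 = torch.tensor([[1.5], [4.0]])
target = clipped_double_q_target(rewards, dones, next_q1, next_q2, gamma=0.99)
# tensor([[2.4850], [0.5000]])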
Example #13
class Agent:
    """Interacts with and learns from the environment."""
    
    def __init__(self, config: ac_parm, device, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            config (ac_parm): configuration object (state/action sizes, learning rates, tau, ...)
            device: torch device the networks run on
            random_seed (int): random seed
        """
        self.config = config
        self.seed = random.seed(random_seed)
        self.name = config.name
        self.device = device

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(config, random_seed).to(device)
        self.actor_target = Actor(config, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=config.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(config, random_seed).to(device)
        self.critic_target = Critic(config, random_seed).to(device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(), lr=config.lr_critic, weight_decay=config.weight_decay)

        # Noise process
        self.noise = OUNoise(config.action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(config, device, random_seed)

        self.step_number = 0
    
    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.step_number += 1
        for (s, a, r, ns, d) in zip(state, action, reward, next_state, done):
            self.memory.add(s, a, r, ns, d)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.config.batch_size and self.step_number % self.config.learn_every == 0:
            experiences = self.memory.sample()
            self.learn(experiences, self.config.gamma)

    def act(self, state):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(state).float().to(self.device)
        actions = np.zeros((state.shape[0], self.config.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action
        self.actor_local.train()
        if self.config.noise_enabled:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        if self.config.gradient_clipping:
            torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.config.tau)
        self.soft_update(self.actor_local, self.actor_target, self.config.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
Example #14
class Agent:
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 actor_file=None,
                 critic_file=None):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            actor_file: path of file containing trained weights of actor network
            critic_file: path of file containing trained weights of critic network
        """

        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed

        #actor network:
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optim = optim.Adam(self.actor_local.parameters(), LR)

        #critic network
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optim = optim.Adam(self.critic_local.parameters(), LR)

        #load trained weights if needed
        if actor_file:
            weights = torch.load(actor_file)
            self.actor_local.load_state_dict(weights)
            self.actor_target.load_state_dict(weights)

        if critic_file:
            weights = torch.load(critic_file)
            self.critic_local.load_state_dict(weights)
            self.critic_target.load_state_dict(weights)

        #init replay buffer
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        self.t_step = 0

    def act(self, state):
        """Returns actions for given state as per current Actor network.
        
        Params
        ======
            state (array_like): current state
            
        """
        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()

        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()

        self.actor_local.train()

        return np.clip(action, -1, 1)

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) > BATCH_SIZE:
                self.learn(GAMMA)

    def learn(self, GAMMA):
        """Update value parameters using batch of experience tuples.
        Params
        ======
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = self.memory.sample()

        #update critic
        target_next_actions = self.actor_target(next_states)
        target_next_q = self.critic_target(next_states, target_next_actions)
        target_q = rewards + (GAMMA * target_next_q * (1 - dones))

        local_q = self.critic_local(states, actions)

        critic_loss = F.mse_loss(local_q, target_q)
        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()

        #update actor
        local_actions = self.actor_local(states)

        actor_loss = -self.critic_local(states, local_actions).mean()
        self.actor_optim.zero_grad()
        actor_loss.backward()
        self.actor_optim.step()

        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def soft_update(self, local_model, target_model, tau=TAU):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
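This agent depends on module-level constants (LR, BUFFER_SIZE, BATCH_SIZE, UPDATE_EVERY, GAMMA, TAU, device) defined elsewhere in the original file; the values below are placeholder assumptions only, chosen to make the snippet self-contained:

import torch

BUFFER_SIZE = int(1e5)   # replay buffer size (assumed)
BATCH_SIZE = 128         # minibatch size (assumed)
GAMMA = 0.99             # discount factor (assumed)
TAU = 1e-3               # soft-update interpolation factor (assumed)
LR = 1e-4                # learning rate used for both optimizers (assumed)
UPDATE_EVERY = 4         # how often to trigger learning (assumed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")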
Example #15
class AgentDDPG:
    """Deep Deterministic Policy Gradient implementation for continuous action space reinforcement learning tasks"""
    def __init__(self,
                 state_size,
                 hidden_size,
                 action_size,
                 actor_learning_rate=1e-4,
                 critic_learning_rate=1e-3,
                 gamma=0.99,
                 tau=1e-2,
                 use_cuda=False,
                 actor_path=None,
                 critic_path=None):
        # Params
        self.state_size, self.hidden_size, self.action_size = state_size, hidden_size, action_size
        self.gamma, self.tau = gamma, tau
        self.use_cuda = use_cuda

        # Networks
        self.actor = Actor(state_size, hidden_size, action_size)
        self.actor_target = Actor(state_size, hidden_size, action_size)

        self.critic = Critic(state_size + action_size, hidden_size,
                             action_size)
        self.critic_target = Critic(state_size + action_size, hidden_size,
                                    action_size)

        # Load model state_dicts from saved file
        if actor_path and path.exists(actor_path):
            self.actor.load_state_dict(torch.load(actor_path))

        if critic_path and path.exists(critic_path):
            self.critic.load_state_dict(torch.load(critic_path))

        # Hard copy params from original networks to target networks
        copy_params(self.actor, self.actor_target)
        copy_params(self.critic, self.critic_target)

        if self.use_cuda:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        # Create replay buffer for storing experience
        self.replay_buffer = ReplayBuffer(cache_size=int(1e6))

        # Training
        self.critic_criterion = nn.MSELoss()
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=actor_learning_rate)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_learning_rate)

    def save_to_file(self, actor_file, critic_file):
        # Save the state_dict's of the Actor and Critic networks
        torch.save(self.actor.state_dict(), actor_file)
        torch.save(self.critic.state_dict(), critic_file)

    def get_action(self, state):
        """Select action with respect to state according to current policy and exploration noise"""
        state = torch.from_numpy(state).float()

        if self.use_cuda:
            state = state.cuda()

        a = self.actor.forward(state)

        if self.use_cuda:
            return a.detach().cpu().numpy()

        return a.detach().numpy()

    def save_experience(self, state_t, action_t, reward_t, state_t1):
        self.replay_buffer.add_sample(state_t, action_t, reward_t, state_t1)

    def update(self, batch_size):
        states, actions, rewards, next_states = self.replay_buffer.get_samples(
            batch_size)
        states = torch.FloatTensor(states)
        actions = torch.FloatTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(next_states)

        if self.use_cuda:
            states = states.cuda()
            next_states = next_states.cuda()
            actions = actions.cuda()
            rewards = rewards.cuda()

        # Critic loss
        Qvals = self.critic.forward(states, actions)
        next_actions = self.actor_target.forward(next_states)
        next_Q = self.critic_target.forward(next_states, next_actions.detach())
        Qprime = rewards + self.gamma * next_Q
        critic_loss = self.critic_criterion(Qvals, Qprime)

        # Update critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Actor loss
        policy_loss = -self.critic.forward(states,
                                           self.actor.forward(states)).mean()

        # Update actor
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # update target networks
        soft_copy_params(self.actor, self.actor_target, self.tau)
        soft_copy_params(self.critic, self.critic_target, self.tau)

    def add_noise_to_weights(self, amount=0.1):
        self.actor.apply(
            lambda x: _add_noise_to_weights(x, amount, self.use_cuda))
        self.critic.apply(
            lambda x: _add_noise_to_weights(x, amount, self.use_cuda))
        self.actor_target.apply(
            lambda x: _add_noise_to_weights(x, amount, self.use_cuda))
        self.critic_target.apply(
            lambda x: _add_noise_to_weights(x, amount, self.use_cuda))
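_add_noise_to_weights is applied to every sub-module but is not defined in this example; a minimal sketch consistent with that usage (Gaussian perturbation of linear-layer weights) might be:

import torch
import torch.nn as nn

def _add_noise_to_weights(module, amount, use_cuda):
    # Only perturb modules that actually carry a weight matrix.
    if isinstance(module, nn.Linear):
        with torch.no_grad():
            noise = torch.randn(module.weight.size()) * amount
            if use_cuda:
                noise = noise.cuda()
            module.weight.add_(noise)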
Example #16
class PPO(BaseAgent):
    def __init__(self, config):
        super(PPO, self).__init__()
        self.config = config
        torch.manual_seed(self.config['seed'])
        np.random.seed(self.config['seed'])

        if self.config['experiment'][
                'orthogonal_initialization_and_layer_scaling']:
            weight_init_scheme = 'orthogonal'
        else:
            weight_init_scheme = 'normal'

        self.actor = Actor(
            device=self.config['device'],
            input_dim=self.config['env']['nS'],
            output_dim=self.config['env']['nA'],
            hidden_dims=self.config['model']['actor']['hidden_dims'],
            hidden_activation_fn=self.config['model']['actor']
            ['hidden_acivation_fn'],
            weight_init_scheme=weight_init_scheme)
        self.actor_optimizer = optim.Adam(
            self.actor.parameters(),
            lr=self.config['model']['actor']['lr'],
            betas=self.config['model']['actor']['betas'])

        self.critic = Critic(
            device=self.config['device'],
            input_dim=self.config['env']['nS'],
            hidden_dims=self.config['model']['critic']['hidden_dims'],
            hidden_activation_fn=self.config['model']['critic']
            ['hidden_acivation_fn'],
            weight_init_scheme=weight_init_scheme)
        self.critic_optimizer = optim.Adam(
            self.critic.parameters(),
            lr=self.config['model']['critic']['lr'],
            betas=self.config['model']['critic']['betas'])

        if self.config['train']['gail']:
            self.discriminator = Discriminator(
                device=self.config['device'],
                state_dim=self.config['env']['nS'],
                action_dim=self.config['env']['nA'],
                hidden_dims=self.config['model']['discriminator']
                ['hidden_dims'],
                hidden_activation_fn=self.config['model']['discriminator']
                ['hidden_acivation_fn'],
                weight_init_scheme=weight_init_scheme)
            self.discriminator_optimizer = optim.Adam(
                self.discriminator.parameters(),
                lr=self.config['model']['discriminator']['lr'],
                betas=self.config['model']['discriminator']['betas'])

        # [EXPERIMENT] - reward scaler: r / rs.std()
        if self.config['experiment']['reward_standardization']:
            self.reward_scaler = RewardScaler(
                gamma=self.config['train']['gamma'])

        # [EXPERIMENT] - observation scaler: (ob - ob.mean()) / (ob.std())
        if self.config['experiment']['observation_normalization']:
            self.observation_scaler = ObservationScaler()

    # train
    def train(self):
        """
        # initialize env, memory
        # foreach episode
        #   foreach timestep
        #     select action
        #     step action
        #     add exp to the memory
        #     if done or timeout or memory_full: update gae & tdlamret
        #     if memory is full
        #       bootstrap value
        #       optimize
        #       clear memory
        #     if done:
        #       wrapup episode
        #       break
        """
        writer_path = os.path.join('experiments', self.config['exp_name'],
                                   'runs')
        self.writer = SummaryWriter(writer_path)

        # Pretrain with BC
        if self.config['train']['bc']:
            bc_train_set, bc_valid_set = get_bc_dataset(
                self.config['train']['bc']['samples_exp_name'],
                self.config['train']['bc']['minimum_score'],
                self.config['train']['bc']['batch_size'],
                self.config['train']['bc']['demo_count'],
                self.config['train']['bc']['val_size'])

            if self.config['experiment']['observation_normalization']:
                use_obs_scaler = True
            else:
                use_obs_scaler = False

            self.actor = pretrain(self.actor,
                                  self.config['train']['bc']['lr'],
                                  self.config['train']['bc']['epochs'],
                                  bc_train_set,
                                  bc_valid_set,
                                  use_obs_scaler,
                                  writer=self.writer)

        # GAIL
        if self.config['train']['gail']:
            self.expert_dataset = get_gail_dataset(
                self.config['train']['gail']['samples_exp_name'],
                self.config['train']['gail']['minimum_score'],
                self.config['train']['gail']['n_samples'],
                self.config['train']['ppo']['memory_size'],
                self.config['train']['gail']['dstep'])

        self.best_score = 0

        # prepare env, memory, stuff
        env = self.init_env(self.config['env']['name'])
        env.seed(self.config['seed'])
        self.memory = PPOMemory(gamma=self.config['train']['gamma'],
                                tau=self.config['train']['gae']['tau'])
        score_queue = deque(maxlen=self.config['train']['average_interval'])
        length_queue = deque(maxlen=self.config['train']['average_interval'])
        if self.config['train']['gail']:
            irl_score_queue = deque(
                maxlen=self.config['train']['average_interval'])

        for episode in trange(1, self.config['train']['max_episodes'] + 1):
            self.episode = episode
            episode_score = 0
            if self.config['train']['gail']:
                irl_episode_score = 0

            # reset env
            state = env.reset()

            for t in range(1,
                           self.config['train']['max_steps_per_episode'] + 1):
                if self.episode % 100 == 0:
                    env.render()

                # [EXPERIMENT] - observation scaler: (ob - ob.mean()) / (ob.std())
                if self.config['experiment']['observation_normalization']:
                    state = self.observation_scaler(state, update=True)

                # select action & estimate value from the state
                with torch.no_grad():
                    state_tensor = torch.tensor(state).unsqueeze(
                        0).float()  # bsz = 1
                    action_tensor, logpa_tensor = self.actor.select_action(
                        state_tensor)
                    value_tensor = self.critic(state_tensor).squeeze(
                        1)  # don't need bsz dim

                # step action
                action = action_tensor.numpy()[0]  # single worker
                next_state, reward, done, _ = env.step(action)

                # update episode_score
                episode_score += reward

                # GAIL: get irl_reward
                if self.config['train']['gail']:
                    with torch.no_grad():
                        reward = self.discriminator.get_irl_reward(
                            state_tensor, action_tensor).detach()
                        irl_episode_score += reward

                # [EXPERIMENT] - reward scaler r / rs.std()
                if self.config['experiment']['reward_standardization']:
                    reward = self.reward_scaler(reward, update=True)

                # [EXPERIMENT] - reward clipping [-5, 5]
                if self.config['experiment']['reward_clipping']:
                    reward = np.clip(reward, -5, 5)

                # add experience to the memory
                self.memory.store(s=state,
                                  a=action,
                                  r=reward,
                                  v=value_tensor.item(),
                                  lp=logpa_tensor.item())

                # done or timeout or memory full
                # done => v = 0
                # timeout or memory full => v = critic(next_state)
                # update gae & return in the memory!!
                timeout = t == self.config['train']['max_steps_per_episode']
                time_to_optimize = len(
                    self.memory) == self.config['train']['ppo']['memory_size']
                if done or timeout or time_to_optimize:
                    if done:
                        # cuz the game is over, value of the next state is 0
                        v = 0
                    else:
                        # if not, estimate it with the critic
                        next_state_tensor = torch.tensor(next_state).unsqueeze(
                            0).float()  # bsz = 1
                        with torch.no_grad():
                            next_value_tensor = self.critic(
                                next_state_tensor).squeeze(1)
                        v = next_value_tensor.item()

                    # update gae & tdlamret
                    self.memory.finish_path(v)

                # if memory is full, optimize PPO
                if time_to_optimize:
                    self.optimize()

                if done:
                    score_queue.append(episode_score)
                    length_queue.append(t)
                    if self.config['train']['gail']:
                        irl_score_queue.append(irl_episode_score)
                    break

                # update state
                state = next_state

            avg_score = np.mean(score_queue)
            std_score = np.std(score_queue)
            avg_duration = np.mean(length_queue)
            self.writer.add_scalar("info/score", avg_score, self.episode)
            self.writer.add_scalar("info/duration", avg_duration, self.episode)

            if self.config['train']['gail']:
                avg_irl_score = np.mean(irl_score_queue)
                self.writer.add_scalar("info/irl_score", avg_irl_score,
                                       self.episode)

            if self.episode % 100 == 0:
                print("{} - score: {:.1f} +-{:.1f} \t duration: {}".format(
                    self.episode, avg_score, std_score, avg_duration))

            # game-solved condition
            # if avg_score >= self.config['train']['terminal_score']:
            #     print("game solved at ep {}".format(self.episode))
            #     self.save_weight(self.actor, self.config['exp_name'], "best")
            #     break
            if avg_score >= self.best_score and self.episode >= 200:
                print("found best model at episode: {}".format(self.episode))
                self.save_weight(self.actor, self.config['exp_name'], "best")
                self.best_score = avg_score

                # [EXPERIMENT] - observation scaler: (ob - ob.mean()) / (ob.std())
                if self.config['experiment']['observation_normalization']:
                    self.observation_scaler.save(self.config['exp_name'])

        self.save_weight(self.actor, self.config['exp_name'], "last")
        return self.best_score

    # optimize
    def optimize(self):
        data = self.prepare_data(self.memory.get())

        # gail
        if self.config['train']['gail']:
            self.optimize_gail(data)

        self.optimize_ppo(data)

    def prepare_data(self, data):
        states_tensor = torch.from_numpy(np.stack(
            data['states'])).float()  # bsz, 8
        actions_tensor = torch.tensor(data['actions']).long()  # bsz
        logpas_tensor = torch.tensor(data['logpas']).float()  # bsz
        tdlamret_tensor = torch.tensor(data['tdlamret']).float()  # bsz
        advants_tensor = torch.tensor(data['advants']).float()  # bsz
        values_tensor = torch.tensor(data['values']).float()  # bsz

        # normalize advant a.k.a atarg
        advants_tensor = (advants_tensor - advants_tensor.mean()) / (
            advants_tensor.std() + 1e-5)

        data_tensor = dict(states=states_tensor,
                           actions=actions_tensor,
                           logpas=logpas_tensor,
                           tdlamret=tdlamret_tensor,
                           advants=advants_tensor,
                           values=values_tensor)

        return data_tensor

    def ppo_iter(self, batch_size, ob, ac, oldpas, atarg, tdlamret,
                 vpredbefore):
        total_size = ob.size(0)
        indices = np.arange(total_size)
        np.random.shuffle(indices)
        n_batches = total_size // batch_size
        for nb in range(n_batches):
            ind = indices[batch_size * nb:batch_size * (nb + 1)]
            yield ob[ind], ac[ind], oldpas[ind], atarg[ind], tdlamret[
                ind], vpredbefore[ind]

    def optimize_gail(self, data):
        """
        https://github.com/openai/baselines/blob/master/baselines/gail/trpo_mpi.py
        bsz = learner_batch_size // d_step
        for each ob_batch, ac_batch in learner_dataset:
            get ob_expert, ac_expert from expert_dataset
            get learner_logit from D
            get expert_logit from D
            get learner loss vs. torch.ones()
            get expert loss vs. torch.zeros()
            update D
        """
        loss_fn = nn.BCELoss()
        D_losses = []
        learner_accuracies = []
        expert_accuracies = []

        learner_ob = data['states']
        learner_ac = data['actions']
        # placeholder tensor so ppo_iter() can be reused purely for minibatching
        rub = torch.zeros_like(learner_ob)
        learner_iter = self.ppo_iter(self.expert_dataset.batch_size,
                                     learner_ob, learner_ac, rub, rub, rub,
                                     rub)
        for learner_ob_b, learner_ac_b, _, _, _, _ in learner_iter:
            expert_ob_b, expert_ac_b = self.expert_dataset.get_next_batch()
            if self.config['experiment']['observation_normalization']:
                expert_ob_b = self.observation_scaler(expert_ob_b,
                                                      update=False).float()

            learner_logit = self.discriminator.forward(learner_ob_b,
                                                       learner_ac_b)
            learner_prob = torch.sigmoid(learner_logit)

            expert_logit = self.discriminator.forward(expert_ob_b, expert_ac_b)
            expert_prob = torch.sigmoid(expert_logit)

            learner_loss = loss_fn(learner_prob, torch.ones_like(learner_prob))
            expert_loss = loss_fn(expert_prob, torch.zeros_like(expert_prob))

            loss = learner_loss + expert_loss
            D_losses.append(loss.item())

            self.discriminator_optimizer.zero_grad()
            loss.backward()
            self.discriminator_optimizer.step()

            learner_acc = ((learner_prob >= 0.5).float().mean().item())
            expert_acc = ((expert_prob < 0.5).float().mean().item())

            learner_accuracies.append(learner_acc)
            expert_accuracies.append(expert_acc)

        avg_d_loss = np.mean(D_losses)
        avg_learner_accuracy = np.mean(learner_accuracies)
        avg_expert_accuracy = np.mean(expert_accuracies)

        self.writer.add_scalar("info/discrim_loss", avg_d_loss, self.episode)
        self.writer.add_scalars("info/gail_accuracy", {
            'learner': avg_learner_accuracy,
            'expert': avg_expert_accuracy
        }, self.episode)

    def optimize_ppo(self, data):
        """
        https://github.com/openai/baselines/blob/master/baselines/ppo1/pposgd_simple.py line 164

        # get data from the memory
        # prepare dataloader
        # foreach optim_epochs
        #   foreach batch
        #     calculate loss and gradient
        #     update nn
        """

        ob = data['states']
        ac = data['actions']
        oldpas = data['logpas']
        atarg = data['advants']
        tdlamret = data['tdlamret']
        vpredbefore = data['values']

        # PPO clipping epsilon (clip_range from the config)
        eps = self.config['train']['ppo']['clip_range']

        policy_losses = []
        entropy_losses = []
        value_losses = []

        # foreach policy_update_epochs
        for i in range(self.config['train']['ppo']['optim_epochs']):
            # foreach batch
            data_loader = self.ppo_iter(
                self.config['train']['ppo']['batch_size'], ob, ac, oldpas,
                atarg, tdlamret, vpredbefore)
            for batch in data_loader:
                ob_b, ac_b, old_logpas_b, atarg_b, vtarg_b, old_vpred_b = batch

                # policy loss
                cur_logpas, cur_entropies = self.actor.get_predictions(
                    ob_b, ac_b)
                ratio = torch.exp(cur_logpas - old_logpas_b)

                # clip ratio
                clipped_ratio = torch.clamp(ratio, 1. - eps, 1. + eps)

                # policy_loss
                surr1 = ratio * atarg_b

                if self.config['experiment']['policy_noclip']:
                    pol_surr = -surr1.mean()
                else:
                    surr2 = clipped_ratio * atarg_b
                    pol_surr = -torch.min(surr1, surr2).mean()

                # value_loss
                cur_vpred = self.critic(ob_b).squeeze(1)

                # [EXPERIMENT] - value clipping: clipped_value = old_values + (curr_values - old_values).clip(-eps, +eps)
                if self.config['experiment']['value_clipping']:
                    cur_vpred_clipped = old_vpred_b + (
                        cur_vpred - old_vpred_b).clamp(-eps, eps)
                    vloss1 = (cur_vpred - vtarg_b).pow(2)
                    vloss2 = (cur_vpred_clipped - vtarg_b).pow(2)
                    vf_loss = torch.max(vloss1, vloss2).mean()
                else:
                    # original value_loss
                    vf_loss = (cur_vpred - vtarg_b).pow(2).mean()

                # entropy_loss
                pol_entpen = -cur_entropies.mean()

                # total loss
                c1 = self.config['train']['ppo']['coef_vf']
                c2 = self.config['train']['ppo']['coef_entpen']

                # actor - backward
                self.actor_optimizer.zero_grad()
                policy_loss = pol_surr + c2 * pol_entpen
                policy_loss.backward()

                # [EXPERIMENT] - clipping gradient with max_norm=0.5
                if self.config['experiment']['clipping_gradient']:
                    nn.utils.clip_grad_norm_(self.actor.parameters(),
                                             max_norm=0.5)

                self.actor_optimizer.step()

                # critic - backward
                self.critic_optimizer.zero_grad()
                value_loss = c1 * vf_loss
                value_loss.backward()

                # [EXPERIMENT] - clipping gradient with max_norm=0.5
                if self.config['experiment']['clipping_gradient']:
                    nn.utils.clip_grad_norm_(self.critic.parameters(),
                                             max_norm=0.5)

                self.critic_optimizer.step()

                policy_losses.append(pol_surr.item())
                entropy_losses.append(pol_entpen.item())
                value_losses.append(vf_loss.item())

        avg_policy_loss = np.mean(policy_losses)
        avg_value_losses = np.mean(value_losses)
        avg_entropy_losses = np.mean(entropy_losses)

        self.writer.add_scalar("info/policy_loss", avg_policy_loss,
                               self.episode)
        self.writer.add_scalar("info/value_loss", avg_value_losses,
                               self.episode)
        self.writer.add_scalar("info/entropy_loss", avg_entropy_losses,
                               self.episode)

    # play
    def play(self,
             num_episodes=1,
             save_traj=False,
             seed=9999,
             record=False,
             save_result=False):

        # [EXPERIMENT] - observation scaler: (ob - ob.mean()) / (ob.std())
        if self.config['experiment']['observation_normalization']:
            self.observation_scaler.load(self.config['exp_name'])

        # load policy
        self.load_weight(self.actor, self.config['exp_name'])

        env = self.init_env(self.config['env']['name'])
        env.seed(seed)
        if record:
            from gym import wrappers
            rec_dir = os.path.join("experiments", self.config['exp_name'],
                                   "seed_{}".format(seed))
            env = wrappers.Monitor(env, rec_dir, force=True)
        scores, trajectories = [], []

        for episode in range(num_episodes):
            current_trajectory = []
            episode_score = 0

            # initialize env
            state = env.reset()

            while True:
                # env.render()

                # [EXPERIMENT] - observation scaler: (ob - ob.mean()) / (ob.std())
                if self.config['experiment']['observation_normalization']:
                    state = self.observation_scaler(state, update=False)

                # select greedy action
                with torch.no_grad():
                    action_tensor = self.actor.select_greedy_action(state)
                action = action_tensor.numpy()[0]  # single env

                current_trajectory.append((state, action))

                # run action
                next_state, reward, done, _ = env.step(action)

                # add reward
                episode_score += reward

                # update state
                state = next_state

                # game over condition
                if done:
                    scores.append(episode_score)
                    trajectories.append((current_trajectory, episode_score))
                    break

        avg_score = np.mean(scores)
        print("Average score {} on {} games".format(avg_score, num_episodes))
        if save_result:
            played_result_path = os.path.join("experiments",
                                              self.config['exp_name'], "runs",
                                              "play_score.pth")
            torch.save(scores, played_result_path)

        if save_traj:
            demo_dir = os.path.join("experiments", self.config['exp_name'],
                                    "demonstration")
            os.makedirs(demo_dir)
            torch.save(trajectories, os.path.join(demo_dir, "demo.pth"))
            print("saved {} trajectories.".format(num_episodes))

        env.close()
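train() above leans on PPOMemory.store() and PPOMemory.finish_path(v) to turn raw rewards into GAE advantages and lambda-returns (tdlamret), but that buffer is not shown here. A minimal sketch of how such a buffer could compute them, assuming the gamma/tau semantics used above; the class name and internals are illustrative, and the real class also exposes a get() that returns and clears the stored rollout:

import numpy as np

class PPOMemorySketch:
    def __init__(self, gamma, tau):
        self.gamma, self.tau = gamma, tau
        self.states, self.actions, self.rewards = [], [], []
        self.values, self.logpas = [], []
        self.advants, self.tdlamret = [], []
        self.path_start = 0

    def __len__(self):
        return len(self.states)

    def store(self, s, a, r, v, lp):
        self.states.append(s); self.actions.append(a); self.rewards.append(r)
        self.values.append(v); self.logpas.append(lp)

    def finish_path(self, last_value):
        # GAE(lambda): delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        # A_t = sum_l (gamma * tau)^l * delta_{t+l};  tdlamret_t = A_t + V(s_t)
        sl = slice(self.path_start, len(self.rewards))
        rewards = np.asarray(self.rewards[sl], dtype=np.float32)
        values = np.append(np.asarray(self.values[sl], dtype=np.float32),
                           last_value)
        deltas = rewards + self.gamma * values[1:] - values[:-1]
        adv, advants = 0.0, np.zeros_like(deltas)
        for t in reversed(range(len(deltas))):
            adv = deltas[t] + self.gamma * self.tau * adv
            advants[t] = adv
        self.advants.extend(advants.tolist())
        self.tdlamret.extend((advants + values[:-1]).tolist())
        self.path_start = len(self.rewards)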
Exemplo n.º 17
0
class TD3:
    def __init__(self,
                 env,
                 state_dim,
                 action_dim,
                 max_action,
                 gamma=0.99,
                 tau=0.005,
                 policy_noise=0.2,
                 noise_clip=0.5,
                 policy_freq=2):
        self.actor = Actor(state_dim, action_dim)
        self.actor_target = Actor(state_dim, action_dim)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-3)

        self.critic = Critic(state_dim, action_dim)
        self.critic_target = Critic(state_dim, action_dim)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)

        self.max_action = max_action
        self.gamma = gamma
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.actor.to(self.device)
        self.actor_target.to(self.device)
        self.critic.to(self.device)
        self.critic_target.to(self.device)

        self.env = env
        self.total_it = 0

    def select_action(self, state, noise=0.1):
        action = self.actor(state.to(self.device)).data.cpu().numpy().flatten()
        if noise != 0:
            action = (action + np.random.normal(
                0, noise, size=self.env.action_space.shape[0]))

        return action.clip(self.env.action_space.low,
                           self.env.action_space.high)

    def train(self, replay_buffer, batch_size=128):
        self.total_it += 1

        states, states_, actions, rewards, terminal = replay_buffer.sample_buffer(
            batch_size)

        with torch.no_grad():
            noise = (torch.randn_like(actions.to(self.device)) *
                     self.policy_noise).clamp(-self.noise_clip,
                                              self.noise_clip)

            next_action = (self.actor_target(states_.to(self.device)) +
                           noise).clamp(-self.max_action, self.max_action)

            # compute the target Q value
            target_q1, target_q2 = self.critic_target(
                states_.to(self.device), next_action.to(self.device))
            target_q = torch.min(target_q1, target_q2)
            # target_q = rewards + terminal * self.gamma + target_q.cpu()
            # target_q = rewards + (terminal.reshape(256, 1) * self.gamma * target_q).detach()
            target_q = rewards + terminal * self.gamma * target_q[:, 0].cpu()

        # Get current Q value
        current_q1, current_q2 = self.critic(states.to(self.device),
                                             actions.to(self.device))

        # Compute critic loss
        critic_loss = F.mse_loss(current_q1[:, 0], target_q.to(
            self.device)) + F.mse_loss(current_q2[:, 0],
                                       target_q.to(self.device))

        # optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Delayed policy updates
        if self.total_it % self.policy_freq == 0:
            # Compute actor loss
            actor_loss = -self.critic.q1(states.to(
                self.device), self.actor(states.to(self.device))).mean()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

    def save(self, filename):
        torch.save(self.critic.state_dict(), filename + "_critic")
        torch.save(self.critic_optimizer.state_dict(),
                   filename + "_critic_optimizer")
        torch.save(self.actor.state_dict(), filename + "_actor")
        torch.save(self.actor_optimizer.state_dict(),
                   filename + "_actor_optimizer")

    def load(self, filename):
        self.critic.load_state_dict(torch.load(filename + "_critic"))
        self.critic_optimizer.load_state_dict(
            torch.load(filename + "_critic_optimizer"))
        self.actor.load_state_dict(torch.load(filename + "_actor"))
        self.actor_optimizer.load_state_dict(
            torch.load(filename + "_actor_optimizer"))
Exemplo n.º 18
0
class Agent():
    def __init__(self, learn_rate, input_shape, num_actions):
        self.num_actions = num_actions
        self.gamma = 0.99
        self.critic_update_max = 20
        self.actor_update_max = 10
        self.memories = []

        # self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.device = torch.device("cpu")

        self.actor = Actor().to(self.device)
        self.critic = Critic().to(self.device)

        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=learn_rate)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=learn_rate)

    def choose_action(self, state, hidden_state):
        state = torch.tensor(state, dtype=torch.float32).to(self.device)

        policy, hidden_state_ = self.actor(state, hidden_state)
        policy = F.softmax(policy, dim=-1)
        actions_probs = torch.distributions.Categorical(policy)
        action = actions_probs.sample()
        action_log_prob = actions_probs.log_prob(action).unsqueeze(0)
        # action = torch.argmax(policy)

        #   prep for storage
        action = action.item()

        return action, policy, hidden_state_, action_log_prob

    def store_memory(self, memory):
        self.memories.append(memory)

    def get_discounted_cum_rewards(self, memory):
        cum_rewards = []
        total = 0
        for reward in reversed(memory.rewards):
            total = reward + total * self.gamma
            cum_rewards.append(total)
        cum_rewards = list(reversed(cum_rewards))
        cum_disc_rewards = torch.tensor(cum_rewards).float().to(self.device)

        return cum_disc_rewards

    def learn(self):
        critic_losses = []
        for memory_idx, memory in enumerate(self.memories):
            print(memory_idx)
            states, actions, policies, rewards, dones, actor_hidden_states, action_log_probs = \
                memory.fetch_on_device(self.device)
            cum_disc_rewards = self.get_discounted_cum_rewards(memory)
            ''' train critic    '''
            self.critic.train()
            self.actor.eval()

            critic_hidden_state = self.critic.get_new_hidden_state()
            for i in range(len(memory.states)):
                state = states[i].detach()
                policy = policies[i].detach()
                action_log_prob = action_log_probs[i].detach()
                done = dones[i].detach()

                true_value = cum_disc_rewards[i]

                value, critic_hidden_state_ = self.critic(
                    state, action_log_prob, critic_hidden_state)
                if done:
                    true_value *= 0.0
                error = value - true_value
                # print("true: {}, value: {}".format(true_value, value))
                critic_loss = error**2
                if critic_loss >= self.critic_update_max:
                    print("critic_loss BIG: {}".format(critic_loss))
                critic_loss = torch.clamp(critic_loss, -self.critic_update_max,
                                          self.critic_update_max)
                critic_losses.append(critic_loss)

                critic_hidden_state = critic_hidden_state_

        # print("end")
        all_critic_loss = sum(critic_losses)
        # all_critic_loss = torch.stack(critic_losses).mean()
        self.critic_optimizer.zero_grad()
        all_critic_loss.backward()
        self.critic_optimizer.step()

        actor_losses = []
        for memory_idx, memory in enumerate(self.memories):
            print(memory_idx)
            states, actions, policies, rewards, dones, actor_hidden_states, action_log_probs = \
                memory.fetch_on_device(self.device)
            ''' train actor     '''
            self.critic.eval()
            self.actor.train()

            critic_hidden_state = self.critic.get_new_hidden_state()
            for i in range(len(memory.states)):
                state = states[i].detach()
                # policy = policies[i]
                action_log_prob = action_log_probs[i]
                critic_hidden_state = critic_hidden_state.detach()
                done = dones[i].detach()

                value, critic_hidden_state_ = self.critic(
                    state, action_log_prob, critic_hidden_state)
                if done:
                    value *= 0.0
                # print("true: {}, value: {}".format(true_value, value))
                actor_loss = value
                if actor_loss >= self.actor_update_max:
                    print("actor_loss BIG: {}".format(actor_loss))
                actor_loss = torch.clamp(actor_loss, -self.actor_update_max,
                                         self.actor_update_max)
                actor_losses.append(actor_loss)

                critic_hidden_state = critic_hidden_state_

        all_actor_loss = sum(actor_losses)
        # all_actor_loss = torch.stack(actor_losses).mean()
        self.actor_optimizer.zero_grad()
        all_actor_loss.backward()
        self.actor_optimizer.step()
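The Agent above assumes a Memory object exposing .states, .rewards and fetch_on_device(device), none of which appear in this excerpt. A minimal sketch consistent with how learn() unpacks it; the field names and the store() signature are assumptions:

import torch

class MemorySketch:
    def __init__(self):
        self.states, self.actions, self.policies = [], [], []
        self.rewards, self.dones = [], []
        self.actor_hidden_states, self.action_log_probs = [], []

    def store(self, state, action, policy, reward, done, hidden, log_prob):
        # assumed signature: called once per environment step
        self.states.append(torch.tensor(state, dtype=torch.float32))
        self.actions.append(torch.tensor(action))
        self.policies.append(policy)
        self.rewards.append(float(reward))
        self.dones.append(torch.tensor(float(done)))
        self.actor_hidden_states.append(hidden)
        self.action_log_probs.append(log_prob)

    def fetch_on_device(self, device):
        move = lambda seq: [x.to(device) if torch.is_tensor(x) else x
                            for x in seq]
        return (move(self.states), move(self.actions), move(self.policies),
                self.rewards, move(self.dones),
                move(self.actor_hidden_states), move(self.action_log_probs))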
Exemplo n.º 19
0
def train(BATCH_SIZE, DISCOUNT, ENTROPY_WEIGHT, HIDDEN_SIZE, LEARNING_RATE,
          MAX_STEPS, POLYAK_FACTOR, REPLAY_SIZE, TEST_INTERVAL,
          UPDATE_INTERVAL, UPDATE_START, ENV, OBSERVATION_LOW, VALUE_FNC,
          FLOW_TYPE, FLOWS, DEMONSTRATIONS, PRIORITIZE_REPLAY,
          BEHAVIOR_CLONING, ARM, BASE, RPA, REWARD_DENSE, logdir):

    ALPHA = 0.3
    BETA = 1
    epsilon = 0.0001  #0.1
    epsilon_d = 0.1  #0.3
    weights = 1  #1
    lambda_ac = 0.85  #0.7
    lambda_bc = 0.3  #0.4

    setup_logger(logdir, locals())
    ENV = __import__(ENV)
    if ARM and BASE:
        env = ENV.youBotAll('youbot_navig2.ttt',
                            obs_lowdim=OBSERVATION_LOW,
                            rpa=RPA,
                            reward_dense=REWARD_DENSE,
                            boundary=1)
    elif ARM:
        env = ENV.youBotArm('youbot_navig.ttt',
                            obs_lowdim=OBSERVATION_LOW,
                            rpa=RPA,
                            reward_dense=REWARD_DENSE)
    elif BASE:
        env = ENV.youBotBase('youbot_navig.ttt',
                             obs_lowdim=OBSERVATION_LOW,
                             rpa=RPA,
                             reward_dense=REWARD_DENSE,
                             boundary=1)

    action_space = env.action_space
    obs_space = env.observation_space()
    step_limit = env.step_limit()

    if OBSERVATION_LOW:
        actor = SoftActorGated(HIDDEN_SIZE,
                               action_space,
                               obs_space,
                               flow_type=FLOW_TYPE,
                               flows=FLOWS).float().to(device)
        critic_1 = Critic(HIDDEN_SIZE,
                          1,
                          obs_space,
                          action_space,
                          state_action=True).float().to(device)
        critic_2 = Critic(HIDDEN_SIZE,
                          1,
                          obs_space,
                          action_space,
                          state_action=True).float().to(device)
    else:
        actor = ActorImageNet(HIDDEN_SIZE,
                              action_space,
                              obs_space,
                              flow_type=FLOW_TYPE,
                              flows=FLOWS).float().to(device)
        critic_1 = Critic(HIDDEN_SIZE,
                          1,
                          obs_space,
                          action_space,
                          state_action=True).float().to(device)
        critic_2 = Critic(HIDDEN_SIZE,
                          1,
                          obs_space,
                          action_space,
                          state_action=True).float().to(device)
        critic_1.load_state_dict(
            torch.load(
                'data/youbot_all_final_21-08-2019_22-32-00/models/critic1_model_473000.pkl'
            ))
        critic_2.load_state_dict(
            torch.load(
                'data/youbot_all_final_21-08-2019_22-32-00/models/critic1_model_473000.pkl'
            ))

    actor.apply(weights_init)
    # critic_1.apply(weights_init)
    # critic_2.apply(weights_init)

    if VALUE_FNC:
        value_critic = Critic(HIDDEN_SIZE, 1, obs_space,
                              action_space).float().to(device)
        target_value_critic = create_target_network(value_critic).float().to(
            device)
        value_critic_optimiser = optim.Adam(value_critic.parameters(),
                                            lr=LEARNING_RATE)
    else:
        target_critic_1 = create_target_network(critic_1)
        target_critic_2 = create_target_network(critic_2)
    actor_optimiser = optim.Adam(actor.parameters(), lr=LEARNING_RATE)
    critics_optimiser = optim.Adam(list(critic_1.parameters()) +
                                   list(critic_2.parameters()),
                                   lr=LEARNING_RATE)

    # Replay buffer
    if PRIORITIZE_REPLAY:
        # D = PrioritizedReplayBuffer(REPLAY_SIZE, ALPHA)
        D = ReplayMemory(device, 3, DISCOUNT, 1, BETA, ALPHA, REPLAY_SIZE)
    else:
        D = deque(maxlen=REPLAY_SIZE)

    eval_ = evaluation_sac(env, logdir, device)

    #Automatic entropy tuning init
    target_entropy = -np.prod(action_space).item()
    log_alpha = torch.zeros(1, requires_grad=True, device=device)
    alpha_optimizer = optim.Adam([log_alpha], lr=LEARNING_RATE)

    home = os.path.expanduser('~')
    if DEMONSTRATIONS:
        dir_dem = os.path.join(home, 'robotics_drl/data/demonstrations/',
                               DEMONSTRATIONS)
        D, n_demonstrations = load_buffer_demonstrations(
            D, dir_dem, PRIORITIZE_REPLAY, OBSERVATION_LOW)
    else:
        n_demonstrations = 0

    if not BEHAVIOR_CLONING:
        behavior_loss = 0

    os.mkdir(os.path.join(home, 'robotics_drl', logdir, 'models'))
    dir_models = os.path.join(home, 'robotics_drl', logdir, 'models')

    state, done = env.reset(), False
    if OBSERVATION_LOW:
        state = state.float().to(device)
    else:
        state['low'] = state['low'].float()
        state['high'] = state['high'].float()
    pbar = tqdm(range(1, MAX_STEPS + 1), unit_scale=1, smoothing=0)

    steps = 0
    success = 0
    for step in pbar:
        with torch.no_grad():
            if step < UPDATE_START and not DEMONSTRATIONS:
                # To improve exploration take actions sampled from a uniform random distribution over actions at the start of training
                action = torch.tensor(env.sample_action(),
                                      dtype=torch.float32,
                                      device=device).unsqueeze(dim=0)
            else:
                # Observe state s and select action a ~ μ(a|s)
                if not OBSERVATION_LOW:
                    state['low'] = state['low'].float().to(device)
                    state['high'] = state['high'].float().to(device)
                action, _ = actor(state, log_prob=False, deterministic=False)
                if not OBSERVATION_LOW:
                    state['low'] = state['low'].float().cpu()
                    state['high'] = state['high'].float().cpu()
                #if (policy.mean).mean() > 0.4:
                #    print("GOOD VELOCITY")
            # Execute a in the environment and observe next state s', reward r, and done signal d to indicate whether s' is terminal
            next_state, reward, done = env.step(
                action.squeeze(dim=0).cpu().tolist())
            if OBSERVATION_LOW:
                next_state = next_state.float().to(device)
            else:
                next_state['low'] = next_state['low'].float()
                next_state['high'] = next_state['high'].float()
            # Store (s, a, r, s', d) in replay buffer D
            if PRIORITIZE_REPLAY:
                if OBSERVATION_LOW:
                    D.add(state.cpu().tolist(),
                          action.cpu().squeeze().tolist(), reward,
                          next_state.cpu().tolist(), done)
                else:
                    D.append(state['high'], state['low'],
                             action.cpu().squeeze().tolist(), reward, done)
            else:
                D.append({
                    'state':
                    state.unsqueeze(dim=0) if OBSERVATION_LOW else state,
                    'action':
                    action,
                    'reward':
                    torch.tensor([reward], dtype=torch.float32, device=device),
                    'next_state':
                    next_state.unsqueeze(
                        dim=0) if OBSERVATION_LOW else next_state,
                    'done':
                    torch.tensor([True if reward == 1 else False],
                                 dtype=torch.float32,
                                 device=device)
                })

            state = next_state

            # If s' is terminal, reset environment state
            steps += 1

            if done or steps > step_limit:  #TODO: incorporate step limit in the environment
                eval_c2 = True  #TODO: multiprocess pyrep with a session for each testing and training
                steps = 0
                if OBSERVATION_LOW:
                    state = env.reset().float().to(device)
                else:
                    state = env.reset()
                    state['low'] = state['low'].float()
                    state['high'] = state['high'].float()
                if reward == 1:
                    success += 1

        if step > UPDATE_START and step % UPDATE_INTERVAL == 0:
            for _ in range(1):
                # Randomly sample a batch of transitions B = {(s, a, r, s', d)} from D
                if PRIORITIZE_REPLAY:
                    if OBSERVATION_LOW:
                        state_batch, action_batch, reward_batch, state_next_batch, done_batch, weights_pr, idxes = D.sample(
                            BATCH_SIZE, BETA)
                        state_batch = torch.from_numpy(state_batch).float().to(
                            device)
                        next_state_batch = torch.from_numpy(
                            state_next_batch).float().to(device)
                        action_batch = torch.from_numpy(
                            action_batch).float().to(device)
                        reward_batch = torch.from_numpy(
                            reward_batch).float().to(device)
                        done_batch = torch.from_numpy(done_batch).float().to(
                            device)
                        weights_pr = torch.from_numpy(weights_pr).float().to(
                            device)
                    else:
                        idxes, high_state_batch, low_state_batch, action_batch, reward_batch, high_state_next_batch, low_state_next_batch, done_batch, weights_pr = D.sample(
                            BATCH_SIZE)

                        state_batch = {
                            'low':
                            low_state_batch.float().to(device).view(-1, 32),
                            'high':
                            high_state_batch.float().to(device).view(
                                -1, 12, 128, 128)
                        }
                        next_state_batch = {
                            'low':
                            low_state_next_batch.float().to(device).view(
                                -1, 32),
                            'high':
                            high_state_next_batch.float().to(device).view(
                                -1, 12, 128, 128)
                        }

                        action_batch = action_batch.float().to(device)
                        reward_batch = reward_batch.float().to(device)
                        done_batch = done_batch.float().to(device)
                        weights_pr = weights_pr.float().to(device)
                        # for j in range(BATCH_SIZE):
                        #     new_state_batch['high'] = torch.cat((new_state_batch['high'], state_batch[j].tolist()['high'].view(-1,(3+1)*env.frames,128,128)), dim=0)
                        #     new_state_batch['low'] = torch.cat((new_state_batch['low'], state_batch[j].tolist()['low'].view(-1,32)), dim=0)
                        #     new_next_state_batch['high'] = torch.cat((new_next_state_batch['high'], state_next_batch[j].tolist()['high'].view(-1,(3+1)*env.frames,128,128)), dim=0)
                        #     new_next_state_batch['low'] = torch.cat((new_next_state_batch['low'], state_next_batch[j].tolist()['low'].view(-1,32)), dim=0)
                        # new_state_batch['high'] = new_state_batch['high'].to(device)
                        # new_state_batch['low'] = new_state_batch['low'].to(device)
                        # new_next_state_batch['high'] = new_next_state_batch['high'].to(device)
                        # new_next_state_batch['low'] = new_next_state_batch['low'].to(device)

                    batch = {
                        'state': state_batch,
                        'action': action_batch,
                        'reward': reward_batch,
                        'next_state': next_state_batch,
                        'done': done_batch
                    }
                    state_batch = []
                    state_next_batch = []

                else:
                    batch = random.sample(D, BATCH_SIZE)
                    state_batch = []
                    action_batch = []
                    reward_batch = []
                    state_next_batch = []
                    done_batch = []
                    for d in batch:
                        state_batch.append(d['state'])
                        action_batch.append(d['action'])
                        reward_batch.append(d['reward'])
                        state_next_batch.append(d['next_state'])
                        done_batch.append(d['done'])

                    batch = {
                        'state': torch.cat(state_batch, dim=0),
                        'action': torch.cat(action_batch, dim=0),
                        'reward': torch.cat(reward_batch, dim=0),
                        'next_state': torch.cat(state_next_batch, dim=0),
                        'done': torch.cat(done_batch, dim=0)
                    }

                action, log_prob = actor(batch['state'],
                                         log_prob=True,
                                         deterministic=False)

                #Automatic entropy tuning
                alpha_loss = -(
                    log_alpha.float() *
                    (log_prob + target_entropy).float().detach()).mean()
                alpha_optimizer.zero_grad()
                alpha_loss.backward()
                alpha_optimizer.step()
                alpha = log_alpha.exp()
                weighted_sample_entropy = (alpha.float() * log_prob).view(
                    -1, 1)

                # Compute targets for Q and V functions
                if VALUE_FNC:
                    y_q = batch['reward'] + DISCOUNT * (
                        1 - batch['done']) * target_value_critic(
                            batch['next_state'])
                    y_v = torch.min(
                        critic_1(batch['state']['low'], action.detach()),
                        critic_2(batch['state']['low'], action.detach())
                    ) - weighted_sample_entropy.detach()
                else:
                    # No value function network
                    with torch.no_grad():
                        next_actions, next_log_prob = actor(
                            batch['next_state'],
                            log_prob=True,
                            deterministic=False)
                        target_qs = torch.min(
                            target_critic_1(
                                batch['next_state']['low'] if
                                not OBSERVATION_LOW else batch['next_state'],
                                next_actions),
                            target_critic_2(
                                batch['next_state']['low'] if
                                not OBSERVATION_LOW else batch['next_state'],
                                next_actions)) - alpha * next_log_prob
                    y_q = batch['reward'] + DISCOUNT * (
                        1 - batch['done']) * target_qs.detach()

                td_error_critic1 = critic_1(
                    batch['state']['low'] if not OBSERVATION_LOW else
                    batch['state'], batch['action']) - y_q
                td_error_critic2 = critic_2(
                    batch['state']['low'] if not OBSERVATION_LOW else
                    batch['state'], batch['action']) - y_q

                q_loss = (td_error_critic1).pow(2).mean() + (
                    td_error_critic2).pow(2).mean()
                # q_loss = (F.mse_loss(critic_1(batch['state'], batch['action']), y_q) + F.mse_loss(critic_2(batch['state'], batch['action']), y_q)).mean()
                critics_optimiser.zero_grad()
                q_loss.backward()
                critics_optimiser.step()

                # Compute priorities, taking demonstrations into account
                if PRIORITIZE_REPLAY:
                    td_error = weights_pr * (td_error_critic1.detach() +
                                             td_error_critic2.detach()).mean()
                    action_dem = torch.tensor([]).to(device)
                    if OBSERVATION_LOW:
                        state_dem = torch.tensor([]).to(device)
                    else:
                        state_dem = {
                            'low': torch.tensor([]).float().to(device),
                            'high': torch.tensor([]).float().to(device)
                        }
                    priorities = torch.abs(td_error).tolist()
                    i = 0
                    count_dem = 0
                    for idx in idxes:
                        priorities[i] += epsilon
                        if idx < n_demonstrations:
                            priorities[i] += epsilon_d
                            count_dem += 1
                            if BEHAVIOR_CLONING:
                                action_dem = torch.cat(
                                    (action_dem, batch['action'][i].view(
                                        1, -1)),
                                    dim=0)
                                if OBSERVATION_LOW:
                                    state_dem = torch.cat(
                                        (state_dem, batch['state'][i].view(
                                            1, -1)),
                                        dim=0)
                                else:
                                    state_dem['high'] = torch.cat(
                                        (state_dem['high'],
                                         batch['state']['high'][i, ].view(
                                             -1,
                                             (3 + 1) * env.frames, 128, 128)),
                                        dim=0)
                                    state_dem['low'] = torch.cat(
                                        (state_dem['low'],
                                         batch['state']['low'][i, ].view(
                                             -1, 32)),
                                        dim=0)
                        i += 1
                    if not action_dem.nelement() == 0:
                        actual_action_dem, _ = actor(state_dem,
                                                     log_prob=False,
                                                     deterministic=True)
                        # q_value_actor = (critic_1(batch['state'][i], batch['action'][i]) + critic_2(batch['state'][i], batch['action'][i]))/2
                        # q_value_actual = (critic_1(batch['state'][i], actual_action_dem) + critic_2(batch['state'][i], actual_action_dem))/2
                        # if q_value_actor > q_value_actual: # Q Filter
                        behavior_loss = F.mse_loss(
                            action_dem, actual_action_dem).unsqueeze(dim=0)
                    else:
                        behavior_loss = 0

                    D.update_priorities(idxes, priorities)
                    lambda_bc = (count_dem / BATCH_SIZE) / 5

                # Update V-function by one step of gradient descent
                if VALUE_FNC:
                    v_loss = (value_critic(batch['state']) -
                              y_v).pow(2).mean().to(device)

                    value_critic_optimiser.zero_grad()
                    v_loss.backward()
                    value_critic_optimiser.step()

                # Update policy by one step of gradient ascent
                with torch.no_grad():
                    new_qs = torch.min(
                        critic_1(
                            batch["state"]['low'] if not OBSERVATION_LOW else
                            batch['state'], action),
                        critic_2(
                            batch["state"]['low'] if not OBSERVATION_LOW else
                            batch['state'], action))
                policy_loss = lambda_ac * (weighted_sample_entropy.view(
                    -1) - new_qs).mean().to(device) + lambda_bc * behavior_loss
                actor_optimiser.zero_grad()
                policy_loss.backward()
                actor_optimiser.step()

                # Update target value network
                if VALUE_FNC:
                    update_target_network(value_critic, target_value_critic,
                                          POLYAK_FACTOR)
                else:
                    update_target_network(critic_1, target_critic_1,
                                          POLYAK_FACTOR)
                    update_target_network(critic_2, target_critic_2,
                                          POLYAK_FACTOR)
        state_dem = []

        # Keep sampling transitions; evaluate only once the test interval is
        # reached and the current episode has finished
        if step > UPDATE_START and step % TEST_INTERVAL == 0: eval_c = True
        else: eval_c = False

        if eval_c and eval_c2:
            eval_c = False
            eval_c2 = False
            actor.eval()
            critic_1.eval()
            critic_2.eval()
            q_value_eval = eval_.get_qvalue(critic_1, critic_2)
            return_ep, steps_ep = eval_.sample_episode(actor)

            logz.log_tabular('Training steps', step)
            logz.log_tabular('Cumulative Success', success)
            logz.log_tabular('Validation return', return_ep.mean())
            logz.log_tabular('Validation steps', steps_ep.mean())
            logz.log_tabular('Validation return std', return_ep.std())
            logz.log_tabular('Validation steps std', steps_ep.std())
            logz.log_tabular('Q-value evaluation', q_value_eval)
            logz.log_tabular('Q-network loss', q_loss.detach().cpu().numpy())
            if VALUE_FNC:
                logz.log_tabular('Value-network loss',
                                 v_loss.detach().cpu().numpy())
            logz.log_tabular('Policy-network loss',
                             policy_loss.detach().cpu().squeeze().numpy())
            logz.log_tabular('Alpha loss', alpha_loss.detach().cpu().numpy())
            logz.log_tabular('Alpha', alpha.detach().cpu().squeeze().numpy())
            logz.log_tabular('Demonstrations current batch', count_dem)
            logz.dump_tabular()

            logz.save_pytorch_model(actor.state_dict())

            torch.save(actor.state_dict(),
                       os.path.join(dir_models, 'actor_model_%s.pkl' % (step)))
            torch.save(
                critic_1.state_dict(),
                os.path.join(dir_models, 'critic1_model_%s.pkl' % (step)))
            torch.save(
                critic_2.state_dict(),
                os.path.join(dir_models, 'critic2_model_%s.pkl' % (step)))

            #pbar.set_description('Step: %i | Reward: %f' % (step, return_ep.mean()))

            actor.train()
            critic_1.train()
            critic_2.train()

    env.terminate()
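The loop above imports create_target_network and update_target_network from models but never shows them. A minimal sketch of Polyak-averaged target networks consistent with how they are called here (target <- polyak * target + (1 - polyak) * online); the bodies are assumptions:

import copy
import torch

def create_target_network(network):
    # Deep-copy the online network and freeze it; only the online network
    # receives gradients.
    target = copy.deepcopy(network)
    for param in target.parameters():
        param.requires_grad = False
    return target

def update_target_network(online, target, polyak):
    # Polyak averaging: target <- polyak * target + (1 - polyak) * online
    with torch.no_grad():
        for p, tp in zip(online.parameters(), target.parameters()):
            tp.data.mul_(polyak).add_((1 - polyak) * p.data)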
Exemplo n.º 20
0
class Agent(object):
    '''
    Implementation of a DDPG agent that interacts with and learns from the
    environment.
    '''
    def __init__(self, state_size, action_size, rand_seed, meta_agent):
        '''Initialize an Agent object.
        :param state_size: int. dimension of each state
        :param action_size: int. dimension of each action
        :param rand_seed: int. random seed
        :param meta_agent: object providing the shared replay memory and
            the number of agents (nb_agents)
        '''

        self.action_size = action_size
        self.__name__ = 'DDPG'

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, rand_seed).to(DEVC)
        self.actor_target = Actor(state_size, action_size, rand_seed).to(DEVC)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   meta_agent.nb_agents, rand_seed).to(DEVC)
        self.critic_target = Critic(state_size, action_size,
                                    meta_agent.nb_agents, rand_seed).to(DEVC)
        # NOTE: the decay corresponds to L2 regularization
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=LR_CRITIC)  # , weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, rand_seed)

        # Replay memory
        self.memory = meta_agent.memory

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done, others_states,
             others_actions, others_next_states):
        self.memory.add(state, action, reward, next_state, done, others_states,
                        others_actions, others_next_states)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                # source: Sample a random minibatch of N transitions from R
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, states, add_noise=True):
        '''Returns actions for given states as per current policy.
        :param states: array_like. current states
        :param add_noise: Boolean. If should add noise to the action
        '''
        states = torch.from_numpy(states).float().to(DEVC)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        '''
        Update policy and value params using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        :param experiences: Tuple[torch.Tensor]. tuple of (s, a, r, s', done)
        :param gamma: float. discount factor
        '''
        (states, actions, rewards, next_states, dones, others_states,
         others_actions, others_next_states) = experiences
        # rewards_ = torch.clamp(rewards, min=-1., max=1.)
        rewards_ = rewards
        all_states = torch.cat((states, others_states), dim=1).to(DEVC)
        all_actions = torch.cat((actions, others_actions), dim=1).to(DEVC)
        all_next_states = torch.cat((next_states, others_next_states),
                                    dim=1).to(DEVC)

        # --------------------------- update critic ---------------------------
        # Get predicted next-state actions and Q values from target models
        l_all_next_actions = []
        l_all_next_actions.append(self.actor_target(states))
        l_all_next_actions.append(self.actor_target(others_states))
        all_next_actions = torch.cat(l_all_next_actions, dim=1).to(DEVC)

        Q_targets_next = self.critic_target(all_next_states, all_next_actions)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards_ + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss: L = 1/N Σ_i (y_i − Q(s_i, a_i|θ^Q))^2
        Q_expected = self.critic_local(all_states, all_actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # --------------------------- update actor ---------------------------
        # Compute actor loss
        this_actions_pred = self.actor_local(states)
        others_actions_pred = self.actor_local(others_states)
        others_actions_pred = others_actions_pred.detach()
        actions_pred = torch.cat((this_actions_pred, others_actions_pred),
                                 dim=1).to(DEVC)
        actor_loss = -self.critic_local(all_states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ---------------------- update target networks ----------------------
        # Update the critic target networks
        # Update the actor target networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        '''Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        :param local_model: PyTorch model. weights will be copied from
        :param target_model: PyTorch model. weights will be copied to
        :param tau: float. interpolation parameter
        '''
        iter_params = zip(target_model.parameters(), local_model.parameters())
        for target_param, local_param in iter_params:
            tensor_aux = tau * local_param.data + (1.0 -
                                                   tau) * target_param.data
            target_param.data.copy_(tensor_aux)

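
The formulas in the learn() and soft_update() docstrings above can be exercised in isolation. Below is a minimal, self-contained sketch (not part of the original example) that computes the DDPG target y = r + γ·Q′(s′, μ′(s′))·(1 − done) and performs one Polyak soft update; the tiny linear networks and all sizes are stand-ins for illustration only.

# Minimal sketch of the DDPG target computation and the Polyak soft update above.
# The linear Actor/Critic stand-ins and all sizes here are illustrative assumptions.
import torch
import torch.nn as nn

state_size, action_size, batch_size = 4, 2, 8
actor_target = nn.Linear(state_size, action_size)
critic_local = nn.Linear(state_size + action_size, 1)
critic_target = nn.Linear(state_size + action_size, 1)

rewards = torch.rand(batch_size, 1)
next_states = torch.rand(batch_size, state_size)
dones = torch.zeros(batch_size, 1)
GAMMA, TAU = 0.99, 1e-3

with torch.no_grad():
    # y_i = r + γ * Q'(s', μ'(s')) * (1 - done)
    next_actions = torch.tanh(actor_target(next_states))
    Q_targets_next = critic_target(torch.cat((next_states, next_actions), dim=1))
    Q_targets = rewards + GAMMA * Q_targets_next * (1 - dones)

# θ_target = τ*θ_local + (1 - τ)*θ_target
for target_param, local_param in zip(critic_target.parameters(),
                                     critic_local.parameters()):
    target_param.data.copy_(TAU * local_param.data +
                            (1.0 - TAU) * target_param.data)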
Exemplo n.º 21
0
import random
from collections import deque

import torch
from torch import optim
from tqdm import tqdm
from env import Env
from models import Actor, Critic, create_target_network, update_target_network
from utils import plot

max_steps, update_start, update_interval, batch_size, discount, policy_delay, polyak_rate = 100000, 10000, 4, 128, 0.99, 2, 0.995
env = Env()
actor = Actor()
critic_1 = Critic(state_action=True)
critic_2 = Critic(state_action=True)
target_actor = create_target_network(actor)
target_critic_1 = create_target_network(critic_1)
target_critic_2 = create_target_network(critic_2)
actor_optimiser = optim.Adam(actor.parameters(), lr=1e-3)
critics_optimiser = optim.Adam(list(critic_1.parameters()) +
                               list(critic_2.parameters()),
                               lr=1e-3)
D = deque(maxlen=10000)

state, done, total_reward = env.reset(), False, 0
pbar = tqdm(range(1, max_steps + 1), unit_scale=1, smoothing=0)
for step in pbar:
    with torch.no_grad():
        if step < update_start:
            # To improve exploration take actions sampled from a uniform random distribution over actions at the start of training
            action = torch.tensor([[2 * random.random() - 1]])
        else:
            # Observe state s and select action a = clip(μ(s) + ε, a_low, a_high)
            action = torch.clamp(actor(state) + 0.1 * torch.randn(1, 1),
                                 min=-1, max=1)
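
The listing above is truncated mid-loop. What typically follows in a TD3-style loop is stepping the environment, appending the transition to D, and, once step >= update_start, updating the critics with a clipped double-Q target while delaying the actor and target-network updates. The self-contained sketch below illustrates just that update math with stand-in linear networks; it is an assumption about the omitted part, not the original code.

# Sketch of the TD3-style update the truncated loop above presumably performs;
# stand-in linear networks and dummy batch tensors, for illustration only.
import torch
import torch.nn as nn
import torch.nn.functional as F

state_dim, action_dim, batch_size = 3, 1, 32
target_actor = nn.Linear(state_dim, action_dim)
critic_1 = nn.Linear(state_dim + action_dim, 1)
critic_2 = nn.Linear(state_dim + action_dim, 1)
target_critic_1 = nn.Linear(state_dim + action_dim, 1)
target_critic_2 = nn.Linear(state_dim + action_dim, 1)

states = torch.rand(batch_size, state_dim)
actions = torch.rand(batch_size, action_dim) * 2 - 1
rewards = torch.rand(batch_size, 1)
next_states = torch.rand(batch_size, state_dim)
discount = 0.99

with torch.no_grad():
    # Target policy smoothing: add clipped noise to the target action
    noise = (0.2 * torch.randn(batch_size, action_dim)).clamp(-0.5, 0.5)
    next_actions = (torch.tanh(target_actor(next_states)) + noise).clamp(-1, 1)
    sa_next = torch.cat((next_states, next_actions), dim=1)
    # Clipped double-Q: take the minimum of the two target critics
    y = rewards + discount * torch.min(target_critic_1(sa_next),
                                       target_critic_2(sa_next))

sa = torch.cat((states, actions), dim=1)
critics_loss = F.mse_loss(critic_1(sa), y) + F.mse_loss(critic_2(sa), y)
# The actor and the target networks would only be updated every policy_delay steps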
Exemplo n.º 22
0
class agent(object):
    def __init__(self, state_size, action_size, num_agents, lr, seed):
        """ 
        Twin Delayed DDPG agent.

        Arguments:
        state_size : Size of the state from environment.
        action_size : Size of the action taken by the agent
        num_agents : Total number of agents
        lr : Common learning rate for both agents
        seed : Seed value for reproducibility
        """
        #Initialization of local and target actor networks
        self.actor = TD3Policy(state_size, action_size, seed)
        self.actor_opt = opt.Adam(self.actor.parameters(), lr=lr)
        self.actor_target = TD3Policy(state_size, action_size, seed)

        #Initialization of local and target critic networks
        self.critic = Critic(state_size, action_size, seed)
        self.critic_opt = opt.Adam(self.critic.parameters(), lr=lr)
        self.critic_target = Critic(state_size, action_size, seed)

        #Agent hyper parameters
        self.num_agents = num_agents
        self.policy_update = 2
        self.step = 0
        self.noise_clip = 0.5
        self.policy_noise = 0.2
        self.gamma = 0.998
        self.TAU = 0.005
        self.batch_size = 64

        #Replay buffer for 'COMPETE' mode
        self.memory = ReplayBuffer(int(1e6), self.batch_size)

        #Hard update of the target networks
        self.hard_update(self.actor, self.actor_target)
        self.hard_update(self.critic, self.critic_target)

    def act(self, state, add_noise=True):
        """
        Returns action for the given state

        Argument:
        state : State vector containing state variables
        """
        state = torch.from_numpy(state).float()
        self.actor.eval()
        with torch.no_grad():
            actions = self.actor(state).cpu().data.numpy()
        self.actor.train()
        return actions

    def update(self, experience):
        """
        Updates the agent's replay buffer and trains the agent
        in 'COMPETE' mode.

        Arguments:
        experience : Tuple containing current experience
        """
        self.memory.add(experience)
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.train(experiences)

    def train(self, observations):
        """
        Trains the agent using a sampled batch of experiences

        Arguments:
        observations : Tuple of batched (states, actions, rewards, next_states, dones)
        """
        states, actions, rewards, next_states, dones = observations
        #The step counter is incremented every timestep to schedule delayed policy updates
        self.step = (self.step + 1) % self.policy_update
        """
        Clipped random noise is added to the target action (target policy
        smoothing). The minimum of the two target Q values is then used to
        reduce the overestimation bias of DDPG.
        """
        with torch.no_grad():
            noise = (torch.randn_like(actions) * self.policy_noise).clamp(
                -self.noise_clip, self.noise_clip)
            next_actions = (self.actor_target(next_states) + noise).clamp(
                -1, 1)
            Q1_target, Q2_target = self.critic_target(next_states,
                                                      next_actions)
            Q_target = torch.min(Q1_target, Q2_target)
            Q_target = rewards + (self.gamma * Q_target * (1 - dones))

        Q1_expected, Q2_expected = self.critic(states, actions)

        critic_loss = F.mse_loss(Q1_expected, Q_target) + F.mse_loss(
            Q2_expected, Q_target)

        #Updating local critic network
        self.critic_opt.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1)
        self.critic_opt.step()
        """
        The policy is updated every self.policy_update timesteps,
        i.e. if policy_update is 2, the actor network is updated once for
        every two critic updates. This delayed update stabilizes learning
        and further limits overestimation.
        """
        if self.step == 0:
            expected_actions = self.actor(states)
            actor_loss = -self.critic.Q1(states, expected_actions).mean()

            #Updating local actor network
            self.actor_opt.zero_grad()
            actor_loss.backward()
            self.actor_opt.step()

            #Soft update of actor and critic target network using TAU
            self.soft_update(self.actor, self.actor_target)
            self.soft_update(self.critic, self.critic_target)

    def soft_update(self, local, target):
        """ 
        Soft update of target network parameters using TAU
        """
        for l, t in zip(local.parameters(), target.parameters()):
            t.data.copy_(self.TAU * l.data + (1 - self.TAU) * t.data)

    def hard_update(self, local, target):
        """
        Hard update which copies local network parameters to target network
        """
        for l, t in zip(local.parameters(), target.parameters()):
            t.data.copy_(l.data)
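
The ReplayBuffer used by this agent is not shown. A minimal deque-based sketch matching the interface assumed above (add(experience), sample(), len(...)) could look like the following; the original implementation may differ.

# Minimal replay buffer sketch matching the interface used above; the original
# ReplayBuffer implementation is not shown and may differ from this assumption.
import random
from collections import deque

import numpy as np
import torch


class ReplayBuffer:
    def __init__(self, buffer_size, batch_size, seed=0):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        random.seed(seed)

    def add(self, experience):
        # experience is a (state, action, reward, next_state, done) tuple
        self.memory.append(experience)

    def sample(self):
        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        as_tensor = lambda x: torch.from_numpy(np.asarray(x, dtype=np.float32))
        return (as_tensor(states), as_tensor(actions),
                as_tensor(rewards).unsqueeze(1), as_tensor(next_states),
                as_tensor(dones).unsqueeze(1))

    def __len__(self):
        return len(self.memory)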
Exemplo n.º 23
0
    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = True

    train_loader, vocab = load(args.batch_size, args.seq_len)

    autoencoder = Autoencoder(args.enc_hidden_dim, args.dec_hidden_dim,
                              args.embedding_dim, args.latent_dim,
                              vocab.size(), args.dropout, args.seq_len)
    autoencoder.load_state_dict(
        torch.load('autoencoder.th', map_location=lambda x, y: x))
    generator = Generator(args.n_layers, args.block_dim)
    critic = Critic(args.n_layers, args.block_dim)

    g_optimizer = optim.Adam(generator.parameters(), lr=args.lr)
    c_optimizer = optim.Adam(critic.parameters(), lr=args.lr)

    if args.cuda:
        autoencoder = autoencoder.cuda()
        generator = generator.cuda()
        critic = critic.cuda()

    print('G Parameters:', sum([p.numel() for p in generator.parameters() if \
                                p.requires_grad]))
    print('C Parameters:', sum([p.numel() for p in critic.parameters() if \
                                p.requires_grad]))

    best_loss = np.inf

    for epoch in range(1, args.epochs + 1):
        g_loss, c_loss = train(epoch)
Exemplo n.º 24
0
class DDPGAgent:
    def __init__(self, env, gamma, tau, buffer_maxlen, batch_size,
                 critic_learning_rate, actor_learning_rate, update_per_step,
                 seed):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        # hyperparameters
        self.num_replay_updates_per_step = update_per_step
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau

        # initialize actor and critic networks
        self.critic = Critic(env.observation_space.shape[0],
                             env.action_space.shape[0], seed).to(self.device)
        self.critic_target = Critic(env.observation_space.shape[0],
                                    env.action_space.shape[0],
                                    seed).to(self.device)

        self.actor = Actor(env.observation_space.shape[0],
                           env.action_space.shape[0], seed).to(self.device)
        self.actor_target = Actor(env.observation_space.shape[0],
                                  env.action_space.shape[0],
                                  seed).to(self.device)

        # optimizers
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_learning_rate)
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=actor_learning_rate)

        self.buffer = ReplayBuffer(buffer_maxlen, batch_size, seed)
        self.noise = OUNoise(env.action_space.shape[0])

    def get_action(self, state):
        state = torch.FloatTensor(state).to(self.device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(state)
        self.actor.train()

        action = action.cpu().numpy()
        return action

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay buffer
        self.buffer.add(state, action, reward, next_state, done)

        q_loss, policy_loss = None, None
        # If enough samples are available in buffer, get random subset and learn
        if len(self.buffer) >= self.batch_size:
            # update the network "num_replay_updates_per_step" times in each step
            for _ in range(self.num_replay_updates_per_step):
                experiences = self.buffer.sample()
                q_loss, policy_loss = self.learn(experiences)
                q_loss = q_loss.detach().item()
                policy_loss = policy_loss.detach().item()

        return q_loss, policy_loss

    def learn(self, experiences):
        """Updating actor and critic parameters based on sampled experiences from replay buffer."""
        states, actions, rewards, next_states, dones = experiences

        curr_Q = self.critic(states, actions)
        next_actions = self.actor_target(next_states).detach()
        next_Q = self.critic_target(next_states, next_actions).detach()
        target_Q = rewards + self.gamma * next_Q * (1 - dones)

        # losses
        q_loss = F.mse_loss(curr_Q, target_Q)
        policy_loss = -self.critic(states, self.actor(states)).mean()

        # update actor
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # update critic
        self.critic_optimizer.zero_grad()
        q_loss.backward()
        self.critic_optimizer.step()

        # update target networks
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data *
                                    (1.0 - self.tau))

        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data *
                                    (1.0 - self.tau))
        return q_loss, policy_loss
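
The OUNoise process instantiated above (and in several other examples here) is not defined in the excerpts. A minimal Ornstein-Uhlenbeck sketch following the OUNoise(size) / reset() / sample() form is shown below; the parameter defaults and exact original interface are assumptions.

# Minimal Ornstein-Uhlenbeck noise sketch; the mu/theta/sigma defaults and the
# constructor signature are assumptions, since the original class is not shown.
import copy

import numpy as np


class OUNoise:
    def __init__(self, size, seed=0, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        # Reset the internal state to the long-running mean
        self.state = copy.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): temporally correlated noise
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * self.rng.standard_normal(len(x))
        self.state = x + dx
        return self.state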
Exemplo n.º 25
0
class NPG:
    def __init__(self, obs_space, action_space, hidden_dim=64):
        self.npg = NPGNetwork(obs_space, action_space, hidden_dim)
        # self.actor = Actor(obs_space, action_space, hidden_size=hidden_dim)
        # self.q = Critic(obs_space, action_space=action_space, hidden_size=hidden_dim)
        self.v = Critic(obs_space, action_space=1, hidden_size=hidden_dim)
        self.adv = Critic(obs_space,
                          action_space=action_space,
                          hidden_size=hidden_dim)
        #self.opt_adv = PKTDOptimizer(
        #    list(self.adv.parameters())
        #)
        self.opt_critic = optim.Adam(self.v.parameters())
        self.opt_adv = optim.Adam(self.adv.parameters())

        self.tau = 1.
        self.beta = 10.
        self.gamma = .99
        self.name = f"npg"
        self.batch_stats = RunningMeanStd()

    def act(self, s):
        with torch.no_grad():
            pi = self.npg(s)
            v = self.v(s).squeeze()
            a = pi.sample().squeeze()
            return a, v, 0

    def update(self, batch):
        s, a, r, s1, done, w, R = batch

        v_next = self.v(s1).squeeze()
        v = self.v(s).squeeze()
        adv = self.adv(s)
        a = F.one_hot(a, adv.shape[1])
        adv = (a * adv).sum(dim=1)

        #(adv + v - self.gamma * (1 - done) * v_next).mean().backward()  # gradient
        #innovation = (r - adv - self.gamma * (1 - done) * v_next).mean().detach()

        self.opt_adv.zero_grad()
        td = r + (1 - done) * self.gamma * v_next - v
        adv_loss = (.5 * (td.detach() - adv).pow(2)).mean()
        adv_loss.backward()
        self.opt_adv.step()

        #adv.mean().backward()
        #adv_norm = torch.stack([p.grad.norm() for  p in self.adv.parameters()]).norm()
        #td = r + v_next - v
        #innovation = (td - adv).mean().detach()
        #opt_stats = self.opt_adv.step(innovation=innovation)
        td_loss = .5 * (td.pow(2)).mean()
        #td = r + v_next - v
        #adv_loss = (.5 * (td.detach() - adv).pow(2)).mean()
        #td_loss = .5 * (td.pow(2).mean())

        self.opt_critic.zero_grad()
        td_loss.backward()
        self.opt_critic.step()

        with torch.no_grad():
            pi_old = self.npg(s)
        _update_target_soft_(self.npg.parameters(),
                             src=self.adv.parameters(),
                             tau=1e-4)

        with torch.no_grad():
            pi = self.npg(s)
        kl = torch.distributions.kl_divergence(pi, pi_old)

        stats = {
            "pi/q_loss": adv_loss,
            #"pi/innovation":innovation,
            "pi/v_loss": td_loss,
            "pi/entropy": pi.entropy().mean(),
            "pi/kl": kl.mean(),
            #"pi/adv_grad_norm":adv_norm
        }
        #stats.update(opt_stats)
        return stats
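
_update_target_soft_, called above to nudge the policy network towards the advantage head, is not defined in this excerpt. Assuming it performs an in-place Polyak average of the src parameters into the destination parameters (and that the two networks have matching shapes), a plausible sketch is:

# Plausible sketch of the _update_target_soft_ helper used above, assuming it
# performs an in-place Polyak average of src parameters into dst parameters.
import torch


def _update_target_soft_(dst, src, tau=1e-3):
    with torch.no_grad():
        for dst_param, src_param in zip(dst, src):
            dst_param.data.copy_(tau * src_param.data +
                                 (1.0 - tau) * dst_param.data)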
Exemplo n.º 26
0
class DDPG():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed, hyper, num_agents, memory):

        self.action_size = action_size
        self.num_agents  = num_agents
    
        # Actor Network (w/ Target Network)
        self.actor_local     = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target    = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=hyper['LR_ACTOR'])

        # Critic Network (w/ Target Network)
        self.critic_local     = Critic(state_size, action_size, num_agents, random_seed).to(device)
        self.critic_target    = Critic(state_size, action_size, num_agents, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=hyper['LR_CRITIC']) #, weight_decay=hyper['WEIGHT_DECAY'])

        # Noise process
        self.noise = OUNoise(action_size, random_seed)
        self.hyper       = hyper
        self.t           = 0
        self.memory      = memory

    def step(self, state, action, reward, next_state, done, others_states,others_actions, others_next_states):
        self.memory.add(state, action, reward, next_state, done, others_states, others_actions, others_next_states)
        self.t = (self.t + 1) % self.hyper['UPDATE_EVERY']
        if self.t == 0:
            if len(self.memory) > self.hyper['BATCH_SIZE']:
                experiences = self.memory.sample()
                self.learn(experiences, self.hyper['GAMMA'])

    def act(self, states, add_noise=True):
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)
    
    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        (states, actions, rewards, next_states, dones, others_states,
         others_actions, others_next_states) = experiences 
        rewards_ = rewards
        all_states = torch.cat((states, others_states), dim=1).to(device)
        all_actions = torch.cat((actions, others_actions), dim=1).to(device)
        all_next_states = torch.cat((next_states, others_next_states), dim=1).to(device)

        # --------------------------- update critic --------------------------- 
        l_all_next_actions = []
        l_all_next_actions.append(self.actor_target(states))
        l_all_next_actions.append(self.actor_target(others_states))
        all_next_actions = torch.cat(l_all_next_actions, dim=1).to(device)

        Q_targets_next = self.critic_target(all_next_states, all_next_actions) 
        Q_targets = rewards_ + (gamma * Q_targets_next * (1 - dones)) 
        Q_expected = self.critic_local(all_states, all_actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward() 
        # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # --------------------------- update actor --------------------------- 
        this_actions_pred = self.actor_local(states)
        others_actions_pred = self.actor_local(others_states)
        others_actions_pred = others_actions_pred.detach()
        actions_pred = torch.cat((this_actions_pred, others_actions_pred), dim=1).to(device)
        actor_loss = -self.critic_local(all_states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ---------------------- update target networks ---------------------- 
        self.soft_update(self.critic_local, self.critic_target, self.hyper['TAU'])
        self.soft_update(self.actor_local, self.actor_target, self.hyper['TAU'])
        
    def soft_update(self, local_model, target_model, tau): 
        iter_params = zip(target_model.parameters(), local_model.parameters())
        for target_param, local_param in iter_params:
            tensor_aux = tau*local_param.data + (1.0-tau)*target_param.data
            target_param.data.copy_(tensor_aux)
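
The hyper dict this class indexes into is not defined in the excerpt. The keys it uses suggest a configuration of roughly the shape below; the values are illustrative assumptions, not the original settings.

# Illustrative hyperparameter dict matching the keys the DDPG class above expects;
# the values are assumptions, not the original settings.
hyper = {
    'LR_ACTOR': 1e-4,       # actor learning rate
    'LR_CRITIC': 1e-3,      # critic learning rate
    'WEIGHT_DECAY': 0.0,    # L2 penalty for the critic optimizer (commented out above)
    'UPDATE_EVERY': 2,      # learn every N environment steps
    'BATCH_SIZE': 128,      # minibatch size sampled from the replay memory
    'GAMMA': 0.99,          # discount factor
    'TAU': 1e-3,            # soft update interpolation parameter
}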
Exemplo n.º 27
0
class Agent:
    def __init__(self,
                 n_states,
                 n_actions,
                 n_goals,
                 action_bounds,
                 capacity,
                 env,
                 k_future,
                 batch_size,
                 action_size=1,
                 tau=0.05,
                 actor_lr=1e-3,
                 critic_lr=1e-3,
                 gamma=0.98):
        self.device = device("cpu")
        self.n_states = n_states
        self.n_actions = n_actions
        self.n_goals = n_goals
        self.k_future = k_future
        self.action_bounds = action_bounds
        self.action_size = action_size
        self.env = env

        self.actor = Actor(self.n_states,
                           n_actions=self.n_actions,
                           n_goals=self.n_goals).to(self.device)
        self.critic = Critic(self.n_states,
                             action_size=self.action_size,
                             n_goals=self.n_goals).to(self.device)
        self.sync_networks(self.actor)
        self.sync_networks(self.critic)
        self.actor_target = Actor(self.n_states,
                                  n_actions=self.n_actions,
                                  n_goals=self.n_goals).to(self.device)
        self.critic_target = Critic(self.n_states,
                                    action_size=self.action_size,
                                    n_goals=self.n_goals).to(self.device)
        self.init_target_networks()
        self.tau = tau
        self.gamma = gamma

        self.capacity = capacity
        self.memory = Memory(self.capacity, self.k_future, self.env)

        self.batch_size = batch_size
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.actor_optim = Adam(self.actor.parameters(), self.actor_lr)
        self.critic_optim = Adam(self.critic.parameters(), self.critic_lr)

        self.state_normalizer = Normalizer(self.n_states[0],
                                           default_clip_range=5)
        self.goal_normalizer = Normalizer(self.n_goals, default_clip_range=5)

    def choose_action(self, state, goal, train_mode=True):
        #Normalize and concatenate the state and goal, then pass them to the actor network.
        #In train mode, Gaussian noise plus occasional uniform random actions are added for exploration.
        state = self.state_normalizer.normalize(state)
        goal = self.goal_normalizer.normalize(goal)
        state = np.expand_dims(state, axis=0)
        goal = np.expand_dims(goal, axis=0)

        with torch.no_grad():
            x = np.concatenate([state, goal], axis=1)
            x = from_numpy(x).float().to(self.device)
            action = self.actor(x)[0].cpu().data.numpy()

        if train_mode:
            action += 0.2 * np.random.randn(self.n_actions)
            action = np.clip(action, self.action_bounds[0],
                             self.action_bounds[1])

            random_actions = np.random.uniform(low=self.action_bounds[0],
                                               high=self.action_bounds[1],
                                               size=self.n_actions)
            action += np.random.binomial(1, 0.3,
                                         1)[0] * (random_actions - action)

        return action

    def store(self, mini_batch):
        for batch in mini_batch:
            self.memory.add(batch)
        self._update_normalizer(mini_batch)

    def init_target_networks(self):
        self.hard_update_networks(self.actor, self.actor_target)
        self.hard_update_networks(self.critic, self.critic_target)

    @staticmethod
    def hard_update_networks(local_model, target_model):
        target_model.load_state_dict(local_model.state_dict())

    @staticmethod
    def soft_update_networks(local_model, target_model, tau=0.05):
        for t_params, e_params in zip(target_model.parameters(),
                                      local_model.parameters()):
            t_params.data.copy_(tau * e_params.data +
                                (1 - tau) * t_params.data)

    def train(self):
        states, actions, rewards, next_states, goals = self.memory.sample(
            self.batch_size)

        states = self.state_normalizer.normalize(states)
        next_states = self.state_normalizer.normalize(next_states)
        goals = self.goal_normalizer.normalize(goals)
        inputs = np.concatenate([states, goals], axis=1)
        next_inputs = np.concatenate([next_states, goals], axis=1)

        inputs = torch.Tensor(inputs).to(self.device)
        rewards = torch.Tensor(rewards).to(self.device)
        next_inputs = torch.Tensor(next_inputs).to(self.device)
        actions = torch.Tensor(actions).to(self.device)

        with torch.no_grad():
            #Evaluate the target critic at the target actor's next action
            target_q = self.critic_target(next_inputs,
                                          self.actor_target(next_inputs))
            #Bellman backup: build the TD target for the sampled (state, goal, action) pairs
            target_returns = rewards + self.gamma * target_q.detach()
            target_returns = torch.clamp(target_returns, -1 / (1 - self.gamma),
                                         0)

        #Evaluate the local critic at the sampled (state, goal, action) pairs
        q_eval = self.critic(inputs, actions)
        critic_loss = (target_returns - q_eval).pow(2).mean()

        a = self.actor(inputs)
        actor_loss = -self.critic(inputs, a).mean()
        actor_loss += a.pow(2).mean()

        self.actor_optim.zero_grad()
        actor_loss.backward()
        self.sync_grads(self.actor)
        self.actor_optim.step()

        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.sync_grads(self.critic)
        self.critic_optim.step()

        return actor_loss.item(), critic_loss.item()

    def save_weights(self):
        torch.save(
            {
                "actor_state_dict": self.actor.state_dict(),
                "state_normalizer_mean": self.state_normalizer.mean,
                "state_normalizer_std": self.state_normalizer.std,
                "goal_normalizer_mean": self.goal_normalizer.mean,
                "goal_normalizer_std": self.goal_normalizer.std
            }, "NBM_FetchPickAndPlace_v2.pth")

    def load_weights(self):

        checkpoint = torch.load("NBM_FetchPickAndPlace_v2.pth")
        actor_state_dict = checkpoint["actor_state_dict"]
        self.actor.load_state_dict(actor_state_dict)
        state_normalizer_mean = checkpoint["state_normalizer_mean"]
        self.state_normalizer.mean = state_normalizer_mean
        state_normalizer_std = checkpoint["state_normalizer_std"]
        self.state_normalizer.std = state_normalizer_std
        goal_normalizer_mean = checkpoint["goal_normalizer_mean"]
        self.goal_normalizer.mean = goal_normalizer_mean
        goal_normalizer_std = checkpoint["goal_normalizer_std"]
        self.goal_normalizer.std = goal_normalizer_std

    def set_to_eval_mode(self):
        self.actor.eval()
        # self.critic.eval()

    def update_networks(self):
        self.soft_update_networks(self.actor, self.actor_target, self.tau)
        self.soft_update_networks(self.critic, self.critic_target, self.tau)

    def _update_normalizer(self, mini_batch):
        states, goals = self.memory.sample_for_normalization(mini_batch)

        self.state_normalizer.update(states)
        self.goal_normalizer.update(goals)
        self.state_normalizer.recompute_stats()
        self.goal_normalizer.recompute_stats()

    @staticmethod
    def sync_networks(network):
        comm = MPI.COMM_WORLD
        flat_params = _get_flat_params_or_grads(network, mode='params')
        comm.Bcast(flat_params, root=0)
        _set_flat_params_or_grads(network, flat_params, mode='params')

    @staticmethod
    def sync_grads(network):
        flat_grads = _get_flat_params_or_grads(network, mode='grads')
        comm = MPI.COMM_WORLD
        global_grads = np.zeros_like(flat_grads)
        comm.Allreduce(flat_grads, global_grads, op=MPI.SUM)
        _set_flat_params_or_grads(network, global_grads, mode='grads')
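
The clamp applied to target_returns in train() above follows from the sparse reward convention r ∈ {-1, 0}: the discounted return can never fall below -1/(1 - γ) (the value of failing forever) nor rise above 0. A small self-contained check with illustrative numbers:

# Sketch of the target clamp used in train() above: with rewards in {-1, 0} the
# valid return range is [-1/(1 - gamma), 0]. The numbers here are illustrative.
import torch

gamma = 0.98
rewards = torch.tensor([[-1.0], [0.0], [-1.0]])
target_q = torch.tensor([[-30.0], [-2.0], [-80.0]])   # stand-in target critic outputs

target_returns = rewards + gamma * target_q
lower_bound = -1.0 / (1.0 - gamma)                     # -50.0 for gamma = 0.98
target_returns = torch.clamp(target_returns, lower_bound, 0.0)
print(target_returns)   # tensor([[-30.4000], [-1.9600], [-50.0000]])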
class D4PGAgent(Agent):
    """An advance D4PG agent with an option to run on a simpler DDPG mode.
    The agent uses a distributional value estimation when running on D4PG vs
    the traditional single value estimation when running on DDPG mode."""
    def __init__(self, params):
        """Initialize an Agent object."""

        self.params = params
        self.update_target_every = params['update_target_every']
        self.update_every = params['update_every']
        self.actor_update_every_multiplier = params[
            'actor_update_every_multiplier']
        self.update_intensity = params['update_intensity']
        self.gamma = params['gamma']
        self.action_size = params['actor_params']['action_size']
        self.num_agents = params['num_agents']
        self.num_atoms = params['critic_params']['num_atoms']
        self.v_min = params['critic_params']['v_min']
        self.v_max = params['critic_params']['v_max']
        self.update_target_type = params['update_target_type']
        self.device = params['device']
        self.name = params['name']
        self.lr_reduction_factor = params['lr_reduction_factor']
        self.tau = params['tau']
        self.d4pg = params['d4pg']

        # Distributes the number of atoms across the range of v min and max
        self.atoms = torch.linspace(self.v_min, self.v_max,
                                    self.num_atoms).to(self.device)

        # Initialize time step count
        self.t_step = 0

        # Active and Target Actor networks
        self.actor_active = Actor(params['actor_params']).to(device)
        self.actor_target = Actor(params['actor_params']).to(device)

        if self.d4pg:
            # Active and Target D4PG Critic networks
            self.critic_active = D4PGCritic(params['critic_params']).to(device)
            self.critic_target = D4PGCritic(params['critic_params']).to(device)
        else:
            # Active and Target Critic networks
            self.critic_active = Critic(params['critic_params']).to(device)
            self.critic_target = Critic(params['critic_params']).to(device)

        self.actor_optimizer = optim.Adam(self.actor_active.parameters(),
                                          lr=params['actor_params']['lr'])
        self.critic_optimizer = optim.Adam(self.critic_active.parameters(),
                                           lr=params['critic_params']['lr'])

        self.schedule_lr = params['schedule_lr']
        self.lr_steps = 0

        # Create learning rate schedulers, if required, to reduce the learning rate
        # depending on plateauing of scores
        if self.schedule_lr:
            self.actor_scheduler = ReduceLROnPlateau(
                self.actor_optimizer,
                mode='max',
                factor=params['lr_reduction_factor'],
                patience=params['lr_patience_factor'],
                verbose=False,
            )
            self.critic_scheduler = ReduceLROnPlateau(
                self.critic_optimizer,
                mode='max',
                factor=params['lr_reduction_factor'],
                patience=params['lr_patience_factor'],
                verbose=False,
            )

        print("\n################ ACTOR ################\n")
        print(self.actor_active)

        print("\n################ CRITIC ################\n")
        print(self.critic_active)

        # Initiate exploration parameters by adding noise to the actions
        self.noise = params['noise']

        # Replay memory
        self.memory = params['experience_replay']

    def act(self, states, add_noise=True, pretrain=False):
        """Returns actions for given state as per current policy."""

        # If pretraining is active, the agent takes uniformly random actions to
        # encourage quick initial exploration of the state space
        if pretrain:
            actions = np.random.uniform(-1., 1.,
                                        (self.num_agents, self.action_size))

        else:
            with torch.no_grad():
                actions = self.actor_active(
                    states.to(device).float()).detach().to('cpu').numpy()
            if add_noise:
                noise = self.noise.create_noise(actions.shape)
                actions += noise

            actions = np.clip(actions, -1., 1.)

        return actions, self.noise.epsilon

    def step(self,
             states,
             actions,
             rewards,
             next_states,
             dones,
             pretrain=False):
        """Save experience in replay memory, and use random sample from buffer to learn."""

        self.memory.add((states, actions, rewards, next_states, dones))
        self.t_step += 1

        if not pretrain:
            return self.learn_()

        return None, None

    def learn_(self):
        "Learns from experience using a distributional value estimation when in D4PG mode"
        actor_loss = None
        critic_loss = None

        # If enough samples are available in memory and its time to learn, then learn!
        if self.memory.ready() and self.t_step % self.update_every == 0:

            # Learns multiple times with the same set of experience
            for _ in range(self.update_intensity):

                # Samples from the replay buffer which has calculated the n step returns in advance
                # Next state represents the state at the n'th step
                states, next_states, actions, rewards, dones = self.memory.sample(
                )

                if self.d4pg:
                    atoms = self.atoms.unsqueeze(0)

                    # Calculate log probability distribution using Zw with regards to stored actions
                    log_probs = self.critic_active(states, actions, log=True)

                    # Calculate the projected target distribution from the target actor and critic networks.
                    # Since backpropagation is not required, the tensors are detached for speed.
                    target_dist = self._get_targets(rewards,
                                                    next_states).detach()

                    # The critic loss is computed against the full value distribution rather than a
                    # single mean value. Cross-entropy loss is used because it is well suited to the
                    # categorical value distributions employed by D4PG
                    critic_loss = -(target_dist * log_probs).sum(-1).mean()

                else:

                    # Get predicted next-state actions and Q values from target models
                    actions_next = self.actor_target(next_states)
                    Q_targets_next = self.critic_target(
                        next_states, actions_next).detach()
                    # Compute Q targets for current states (y_i)
                    Q_targets = rewards + (self.gamma * Q_targets_next *
                                           (1 - dones))
                    # Compute critic loss
                    Q_expected = self.critic_active(states, actions)
                    critic_loss = F.mse_loss(Q_expected, Q_targets)

                # Execute gradient descent for the critic
                self.critic_optimizer.zero_grad()
                critic_loss.backward()
                torch.nn.utils.clip_grad_norm_(self.critic_active.parameters(),
                                               1)
                self.critic_optimizer.step()
                critic_loss = critic_loss.item()

                # Update actor every x multiples of critic
                if self.t_step % (self.actor_update_every_multiplier *
                                  self.update_every) == 0:

                    if self.d4pg:
                        # Predicts the action for the actor networks loss calculation
                        predicted_action = self.actor_active(states)
                        # Predict the value distribution using the critic with regards to action predicted by actor
                        probs = self.critic_active(states, predicted_action)
                        # Multiply probabilities by atom values and sum across columns to get Q values
                        expected_reward = (probs * atoms).sum(-1)
                        # Calculate the actor network loss (Policy Gradient)
                        # Get the negative of the mean across the expected rewards to do gradient ascent
                        actor_loss = -expected_reward.mean()
                    else:
                        actions_pred = self.actor_active(states)
                        actor_loss = -self.critic_active(states,
                                                         actions_pred).mean()

                    # Execute gradient ascent for the actor
                    self.actor_optimizer.zero_grad()
                    actor_loss.backward()
                    self.actor_optimizer.step()
                    actor_loss = actor_loss.item()

        # Updates the target networks every n steps
        if self.t_step % self.update_target_every == 0:
            self._update_target_networks()

        # Returns the actor and critic losses to store on tensorboard
        return actor_loss, critic_loss

    def _get_targets(self, rewards, next_states):
        """
        Calculate Yᵢ from the target networks using the target actor and
        distributional critic networks
        """

        target_actions = self.actor_target(next_states)
        target_probs = self.critic_target(next_states, target_actions)

        # Project the categorical distribution
        projected_probs = self._get_value_distribution(rewards, target_probs)
        return projected_probs

    def _get_value_distribution(self, rewards, probs):
        """
        Returns the projected value distribution for the input state/action pair
        """

        delta_z = (self.v_max - self.v_min) / (self.num_atoms - 1)

        # Rewards were stored as the accumulated n-step discounted return, so add that return
        # to the atom values discounted by gamma**rollout_length
        projected_atoms = rewards.unsqueeze(
            -1
        ) + self.gamma**self.memory.rollout_length * self.atoms.unsqueeze(0)
        projected_atoms.clamp_(self.v_min, self.v_max)
        b = (projected_atoms - self.v_min) / delta_z

        # Floating point imprecision can leave b fractionally above an integer
        # (e.g. 99.000000001), in which case ceil() would round up to the wrong bin.
        # Rounding b to a fixed precision before taking floor() and ceil() avoids this
        # off-by-one in the lower and upper bounds without affecting the projection.
        precision = 1
        b = torch.round(b * 10**precision) / 10**precision
        lower_bound = b.floor()
        upper_bound = b.ceil()

        m_lower = (upper_bound +
                   (lower_bound == upper_bound).float() - b) * probs
        m_upper = (b - lower_bound) * probs

        projected_probs = torch.tensor(np.zeros(probs.size())).to(self.device)

        for idx in range(probs.size(0)):
            projected_probs[idx].index_add_(0, lower_bound[idx].long(),
                                            m_lower[idx].double())
            projected_probs[idx].index_add_(0, upper_bound[idx].long(),
                                            m_upper[idx].double())
        return projected_probs.float()
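
For reference, the categorical projection performed by _get_value_distribution can be written as a stand-alone function. The sketch below is a simplified single-step variant (no n-step discounting or precision rounding) and is an illustration, not the original method.

# Simplified, self-contained sketch of the categorical (C51-style) projection used
# by _get_value_distribution above; single-step variant, for illustration only.
import torch


def project_distribution(rewards, probs, atoms, gamma, v_min, v_max):
    """rewards: (batch, 1), probs: (batch, n_atoms), atoms: (n_atoms,)"""
    n_atoms = atoms.size(0)
    delta_z = (v_max - v_min) / (n_atoms - 1)

    projected_atoms = (rewards + gamma * atoms.unsqueeze(0)).clamp(v_min, v_max)
    b = (projected_atoms - v_min) / delta_z
    lower, upper = b.floor(), b.ceil()

    # Split each probability mass between the two neighbouring atoms
    m_lower = (upper + (lower == upper).float() - b) * probs
    m_upper = (b - lower) * probs

    projected = torch.zeros_like(probs)
    for i in range(probs.size(0)):
        projected[i].index_add_(0, lower[i].long(), m_lower[i])
        projected[i].index_add_(0, upper[i].long(), m_upper[i])
    return projected


atoms = torch.linspace(-1.0, 1.0, 51)
probs = torch.softmax(torch.randn(4, 51), dim=-1)   # stand-in target critic output
projected = project_distribution(torch.rand(4, 1), probs, atoms, 0.99, -1.0, 1.0)
print(projected.sum(dim=-1))                        # each row still sums to ~1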
Exemplo n.º 29
0
class DDPGAgents():
    """ Agent used to interact with and learns from the environment """
    def __init__(self, state_size, action_size, config):
        """ Initialize an agent object """

        self.state_size = state_size
        self.action_size = action_size
        self.config = config

        # retrieve number of agents
        self.num_agents = config["DDPG"]["num_agents"]

        # logging for this class
        self.logger = logging.getLogger(self.__class__.__name__)

        # gpu support
        self.device = pick_device(config, self.logger)

        ## Actor local and target networks
        self.actor_local = Actor(state_size, action_size,
                                 config).to(self.device)
        self.actor_target = Actor(state_size, action_size,
                                  config).to(self.device)
        self.actor_optimizer = getattr(
            optim, config["optimizer_actor"]["optimizer_type"])(
                self.actor_local.parameters(),
                betas=tuple(config["optimizer_actor"]["betas"]),
                **config["optimizer_actor"]["optimizer_params"])

        ## Critic local and target networks
        self.critic_local = Critic(state_size, action_size,
                                   config).to(self.device)
        self.critic_target = Critic(state_size, action_size,
                                    config).to(self.device)
        self.critic_optimizer = getattr(
            optim, config["optimizer_critic"]["optimizer_type"])(
                self.critic_local.parameters(),
                betas=tuple(config["optimizer_critic"]["betas"]),
                **config["optimizer_critic"]["optimizer_params"])

        ## Noise process
        self.noise = OUNoise((self.num_agents, action_size))

        ## Replay memory
        self.memory = ReplayBuffer(config=config,
                                   action_size=action_size,
                                   buffer_size=int(
                                       config["DDPG"]["buffer_size"]),
                                   batch_size=config["trainer"]["batch_size"])

    def step(self, state, action, reward, next_state, done):
        """ Save experience in replay memory, 
		and use random sample from buffer to learn """

        # Save experience in replay memory shared by all agents
        for agent in range(self.num_agents):
            self.memory.add(state[agent, :], action[agent, :], reward[agent],
                            next_state[agent, :], done[agent])

        # learn every timestep as long as enough samples are available in memory
        if len(self.memory) > self.config["trainer"]["batch_size"]:
            experiences = self.memory.sample()
            self.learn(experiences, self.config["DDPG"]["gamma"])

    def act(self, states, add_noise=False):
        """ Returns actions for given state as per current policy """

        # Convert states to a tensor
        states = torch.from_numpy(states).float().to(self.device)

        # prepare actions numpy array for all agents
        actions = np.zeros((self.num_agents, self.action_size))

        ## Evaluation mode
        self.actor_local.eval()
        with torch.no_grad():
            # Forward pass of local actor network
            for agent, state in enumerate(states):
                action_values = self.actor_local.forward(
                    state).cpu().data.numpy()
                actions[agent, :] = action_values

        # pdb.set_trace()
        ## Training mode
        self.actor_local.train()
        if add_noise:
            # Add noise to improve exploration to our actor policy
            # action_values += torch.from_numpy(self.noise.sample()).type(torch.FloatTensor).to(self.device)
            actions += self.noise.sample()
        # Clip action to stay in the range [-1, 1] for our task
        actions = np.clip(actions, -1, 1)

        return actions

    def learn(self, experiences, gamma):
        """ Update value parameters using given batch of experience tuples """

        states, actions, rewards, next_states, dones = experiences

        ## Update actor (policy) network using the sampled policy gradient
        # Compute actor loss
        actions_pred = self.actor_local.forward(states)
        actor_loss = -self.critic_local.forward(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        ## Update critic (value) network
        # Get predicted next-state actions and Q-values from target models
        actions_next = self.actor_target.forward(next_states)
        Q_targets_next = self.critic_target.forward(next_states, actions_next)
        # Compute Q-targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get expected Q-values from local critic model
        Q_expected = self.critic_local.forward(states, actions)
        # Compute loss
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        ## Update target networks with a soft update
        self.soft_update(self.actor_local, self.actor_target,
                         self.config["DDPG"]["tau"])
        self.soft_update(self.critic_local, self.critic_target,
                         self.config["DDPG"]["tau"])

    def soft_update(self, local_model, target_model, tau):
        """ Soft update model parameters,
		improves the stability of learning """

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def reset(self):
        """ Reset noise """
        self.noise.reset()
Exemplo n.º 30
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPSILON

        ### DEFINE THE ACTOR NETWORK ###
        ### INFINITE STEP BOOTSTRAPPING, THEREFORE HIGH VARIANCE ###
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        ### DEFINE THE CRITIC NETWORK ###
        ### ONE STEP BOOTSTRAPPING, THEREFORE HIGH BIAS ###
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        ### PROCESS TO CREATE NOISE ###
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done, timestep):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn at defined interval, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and timestep % LEARN_EVERY == 0:
            for _ in range(LEARN_NUM):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.epsilon * self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # ---------------------------- update noise ---------------------------- #
        self.epsilon -= EPSILON_DECAY
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
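
The module-level constants this last agent refers to (BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, LR_ACTOR, LR_CRITIC, WEIGHT_DECAY, LEARN_EVERY, LEARN_NUM, EPSILON, EPSILON_DECAY and device) are defined outside the excerpt. Typical values for this kind of DDPG setup might look as follows; these are assumptions, not the original configuration.

# Illustrative values for the module-level constants the agent above expects;
# these are assumptions about a typical DDPG configuration, not the original values.
import torch

BUFFER_SIZE = int(1e6)    # replay buffer size
BATCH_SIZE = 128          # minibatch size
GAMMA = 0.99              # discount factor
TAU = 1e-3                # soft update interpolation parameter
LR_ACTOR = 1e-4           # actor learning rate
LR_CRITIC = 1e-3          # critic learning rate
WEIGHT_DECAY = 0.0        # L2 weight decay for the critic optimizer
LEARN_EVERY = 20          # learn every N timesteps
LEARN_NUM = 10            # number of learning passes per learning step
EPSILON = 1.0             # initial scale of the exploration noise
EPSILON_DECAY = 1e-6      # per-update decay of the noise scale
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")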