Exemplo n.º 1
0
class DDPGAgent:
    def __init__(self, state_size, action_size, num_agents,
                 hidden_actor, hidden_critic, lr_actor, lr_critic,
                 buffer_size, agent_id, use_PER=False, seed=0):

        super(DDPGAgent, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.agent_id = agent_id

        # num_agents*action_size
        self.actor_local = ActorNet(state_size, hidden_actor, action_size, seed=seed).to(device)
        self.critic_local = CriticNet(num_agents*state_size, num_agents*action_size, hidden_critic, 1, seed=seed).to(device)
        self.actor_target = ActorNet(state_size, hidden_actor, action_size, seed=seed).to(device)
        self.critic_target = CriticNet(num_agents*state_size, num_agents*action_size, hidden_critic, 1, seed=seed).to(device)

        self.actor_optimizer = Adam(self.actor_local.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=0.) #weight_decay=1.e-5

        self.memory = ReplayBuffer(buffer_size, num_agents, state_size, action_size, use_PER)

        # initialize targets same as original networks
        hard_update(self.actor_target, self.actor_local)
        hard_update(self.critic_target, self.critic_local)

        #self.actor_target.eval() #wont be training target net
        #self.critic_target.eval()


    def _act(self, obs):
        obs = obs.to(device)

        if len(obs.shape)==1: obs = obs.unsqueeze(0) #for batchnorm

        self.actor_local.eval()
        with torch.no_grad():
            action_local = self.actor_local(obs).squeeze()
        self.actor_local.train()

        return action_local #tensor, action_size torch.Size([2])

    def _target_act(self, obs):
        obs = obs.to(device)

        if len(obs.shape)==1: obs = obs.unsqueeze(0) #for batchnorm

        with torch.no_grad():
            action_target = self.actor_target(obs).squeeze()

        return action_target
Exemplo n.º 2
0
class Learner:
    def __init__(self, learner_id, config, dev, shared_state, shared_queue):

        self.action_size = config['action_space']
        self.obs_size = config['obs_space']

        self.shared_queue = shared_queue
        self.shared_state = shared_state

        self.dev = dev
        self.id = learner_id
        self.burn_in_length = config['burn_in_length']  # 40-80
        self.learning_length = config['learning_length']
        self.sequence_length = self.burn_in_length + self.learning_length
        self.n_step = config['n_step']
        self.sequence = []
        self.recurrent_state = []
        self.priority = []
        self.td_loss = deque(maxlen=self.learning_length)

        self.gamma = config['gamma']
        #        self.actor_parameter_update_interval = config['actor_parameter_update_interval']

        self.actor = ActorNet(dev, config).to(self.dev)
        self.target_actor = ActorNet(dev, config).to(self.dev)
        self.critic = CriticNet(dev, config).to(self.dev)
        self.target_critic = CriticNet(dev, config).to(self.dev)

        self.actor.load_state_dict(self.shared_state["actor"].state_dict())
        self.target_actor.load_state_dict(
            self.shared_state["target_actor"].state_dict())
        self.critic.load_state_dict(self.shared_state["critic"].state_dict())
        self.target_critic.load_state_dict(
            self.shared_state["target_critic"].state_dict())

        #        self.actor.load_state_dict(self.shared_state["actor"])
        #        self.target_actor.load_state_dict(self.shared_state["target_actor"])
        #        self.critic.load_state_dict(self.shared_state["critic"])
        #        self.target_critic.load_state_dict(self.shared_state["target_critic"])

        self.learner_actor_rate = config['learner_actor_rate']

        self.num_actors = learner_id
        self.n_actions = 1
        self.max_frame = config['learner_max_frame']

        self.memory_sequence_size = config['memory_sequence_size']
        self.batch_size = config['batch_size']
        self.memory = LearnerReplayMemory(self.memory_sequence_size, config,
                                          dev)

        self.model_path = './'
        #        self.memory_path = './memory_data/'
        #        self.model_save_interval = 10 # 50
        self.learner_parameter_update_interval = config[
            'learner_parameter_update_interval']  # 50
        self.target_update_inverval = config['target_update_interval']  # 100

        self.gamma = config['gamma']
        self.actor_lr = config['actor_lr']
        self.critic_lr = config['critic_lr']
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.actor_lr)
        self.actor_criterion = nn.MSELoss()
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=self.critic_lr)
        self.critic_criterion = nn.MSELoss()

    def __del__(self):
        self.shared_queue.close()
        self.shared_state.close()

#        self.save_model()

    def save_model(self):
        model_dict = {
            'actor': self.actor.state_dict(),
            'target_actor': self.target_actor.state_dict(),
            'critic': self.critic.state_dict(),
            'target_critic': self.target_critic.state_dict()
        }
        torch.save(model_dict, self.model_path + 'model.pt')

    def update_target_model(self):
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

    def run(self):
        time_check(-1)
        while self.memory.size() < self.batch_size:
            self.memory.append(self.shared_queue.get(block=True))
            #            self.memory.append(self.shared_queue.get())
            print('\rmem size: ', self.memory.size(), end='\r')
        time_check(1)
        count_mem = 0
        frame = 0
        win_v = vis.line(Y=torch.Tensor([0]), opts=dict(title='V_loss'))
        win_p = vis.line(Y=torch.Tensor([0]), opts=dict(title='P_loss'))

        while frame < self.max_frame:
            #            sleep(0.0001)
            #            if self.shared_queue.qsize()==0 and count_mem <0:
            #                self.memory.append(self.shared_queue.get(block=True))
            #
            #            for i in range(self.shared_queue.qsize()):
            #                self.memory.append(self.shared_queue.get(block=False))
            #                count_mem += self.learner_actor_rate

            #            print('waiting  shared q {}/{}'.format(self.memory.size(),self.batch_size))

            #            self.shared_state['frame'][self.id]=frame
            #            while self.shared_state['sleep'][self.id] :
            #                sleep(0.5)
            #            if self.shared_queue.qsize()==0 and count_mem <0:
            #                self.memory.append(self.shared_queue.get(block=True))
            #                self.memory.append(self.shared_queue.get())

            #            for i in range(self.shared_queue.qsize()):
            ##                global_buf.append(self.shared_queue.get())
            #                self.memory.append(self.shared_queue.get())
            #                count_mem += self.learner_actor_rate

            if self.shared_queue.qsize() != 0:
                self.memory.append(self.shared_queue.get(block=True))

            frame += 1

            count_mem -= 1

            episode_index, sequence_index, obs_seq, action_seq, reward_seq, gamma_seq, a_state, ta_state, c_state, tc_state = self.memory.sample(
            )

            self.actor.set_state(a_state[0], a_state[1])
            self.target_actor.set_state(ta_state[0], ta_state[1])
            self.critic.set_state(c_state[0], c_state[1])
            self.target_critic.set_state(tc_state[0], tc_state[1])

            ### burn-in step ###
            _ = [self.actor(obs_seq[i]) for i in range(self.burn_in_length)]
            _ = [
                self.critic(obs_seq[i], action_seq[i])
                for i in range(self.burn_in_length)
            ]
            _ = [
                self.target_actor(obs_seq[i])
                for i in range(self.burn_in_length + self.n_step)
            ]
            _ = [
                self.target_critic(obs_seq[i], action_seq[i])
                for i in range(self.burn_in_length + self.n_step)
            ]
            ### learning steps ###

            # update ciritic
            q_value = torch.zeros(self.learning_length * self.batch_size,
                                  self.n_actions)

            target_q_value = torch.zeros(
                self.learning_length * self.batch_size, self.n_actions)
            for i in range(self.learning_length):
                obs_i = self.burn_in_length + i
                next_obs_i = self.burn_in_length + i + self.n_step
                q_value[i * self.batch_size:(i + 1) *
                        self.batch_size] = self.critic(obs_seq[obs_i],
                                                       action_seq[obs_i])
                with torch.no_grad():
                    next_q_value = self.target_critic(
                        obs_seq[next_obs_i],
                        self.target_actor(obs_seq[next_obs_i]))
                    target_q_val = reward_seq[obs_i] + (
                        gamma_seq[next_obs_i]**self.n_step) * next_q_value
                    #                target_q_val = invertical_vf(target_q_val)
                    target_q_value[i * self.batch_size:(i + 1) *
                                   self.batch_size] = target_q_val

            critic_loss = self.actor_criterion(q_value,
                                               target_q_value.detach())
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # update actor
            self.actor.reset_state()
            self.critic.reset_state()
            actor_loss = torch.zeros(self.learning_length * self.batch_size,
                                     self.n_actions).to(self.dev)
            for i in range(self.learning_length):
                obs_i = i + self.burn_in_length
                action = self.actor(obs_seq[obs_i])
                actor_loss[i * self.batch_size:(i + 1) *
                           self.batch_size] = -self.critic(
                               obs_seq[obs_i], self.actor(obs_seq[obs_i]))
            actor_loss = actor_loss.mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # update target networks
            if frame % self.target_update_inverval == 0:
                self.update_target_model()

            print('#', frame, 'critic_loss:', critic_loss.item(),
                  '  actor_loss:', actor_loss.item(), '  count:', count_mem)
            win_p = vis.line(X=torch.Tensor([frame]),
                             Y=torch.Tensor([actor_loss.item()]),
                             win=win_p,
                             update='append')
            win_v = vis.line(X=torch.Tensor([frame]),
                             Y=torch.Tensor([critic_loss.item()]),
                             win=win_v,
                             update='append')

            # calc priority
            average_td_loss = ((q_value - target_q_value)**2).detach().to(
                self.dev)

            #            average_td_loss = np.mean(((q_value - target_q_value)**2).detach().cpu().numpy() , axis = 1)
            for i in range(len(episode_index)):
                td = average_td_loss[i:-1:self.batch_size]
                self.memory.priority[episode_index[i]][
                    sequence_index[i]] = calc_priority(td).cpu().view(1, -1)
                self.memory.total_priority[episode_index[i]] = torch.cat(
                    self.memory.priority[episode_index[i]]).sum(0).view(1, -1)


#                self.memory.priority[episode_index[i]][sequence_index[i]] = calc_priority(td)
#                self.memory.total_priority[episode_index[i]] = sum(self.memory.priority[episode_index[i]])

#            if frame % self.model_save_interval == 0:
#                self.save_model()

            if frame % self.learner_parameter_update_interval == 0:
                #                print('learner update ')

                #                [self.shared_state["actor"][k] = v.cpu() for k,v in self.actor.state_dict().item() ]
                #                [self.shared_state["target_actor"][k] = v.cpu() for k,v in self.target_actor.state_dict().item() ]
                #                [self.shared_state["critic"][k] = v.cpu() for k,v in self.critic.state_dict().item() ]
                #                [self.shared_state["target_critic"][k] = v.cpu() for k,v in self.target_critic.state_dict().item() ]

                #
                #                for k,v in self.actor.state_dict().items():
                #                    self.shared_state["actor"][k] = v.cpu()
                #                for k,v in self.target_actor.state_dict().items():
                #                    self.shared_state["target_actor"][k] = v.cpu()
                #                for k,v in self.critic.state_dict().items():
                #                    self.shared_state["critic"][k] = v.cpu()
                #                for k,v in self.target_critic.state_dict().items():
                #                    self.shared_state["target_critic"][k] = v.cpu()

                #                self.shared_state["actor"] = self.actor.state_dict()
                #                self.shared_state["target_actor"] = self.target_actor.state_dict()
                #                self.shared_state["critic"] = self.critic.state_dict()
                #                self.shared_state["target_critic"] = self.target_critic.state_dict()

                self.shared_state["actor"].load_state_dict(
                    self.actor.state_dict())
                self.shared_state["critic"].load_state_dict(
                    self.critic.state_dict())
                self.shared_state["target_actor"].load_state_dict(
                    self.target_actor.state_dict())
                self.shared_state["target_critic"].load_state_dict(
                    self.target_critic.state_dict())
                for i in range(self.num_actors):
                    self.shared_state["update"][i] = True

                print('learner_update', self.actor.policy_l0.weight.data[0][0])

            self.actor.reset_state()
            self.target_actor.reset_state()
            self.critic.reset_state()
            self.target_critic.reset_state()
Exemplo n.º 3
0
class D4PG_Agent:
    """
    PyTorch Implementation of D4PG:
    "Distributed Distributional Deterministic Policy Gradients"
    (Barth-Maron, Hoffman, et al., 2018)
    As described in the paper at: https://arxiv.org/pdf/1804.08617.pdf

    Much thanks also to the original DDPG paper:
    "Continuous Control with Deep Reinforcement Learning"
    (Lillicrap, Hunt, et al., 2016)
    https://arxiv.org/pdf/1509.02971.pdf

    And to:
    "A Distributional Perspective on Reinforcement Learning"
    (Bellemare, Dabney, et al., 2017)
    https://arxiv.org/pdf/1707.06887.pdf

    D4PG utilizes distributional value estimation, n-step returns,
    prioritized experience replay (PER), distributed K-actor exploration,
    and off-policy actor-critic learning to achieve very fast and stable
    learning for continuous control tasks.

    This version of the Agent is written to interact with Udacity's
    Continuous Control robotic arm manipulation environment which provides
    20 simultaneous actors, negating the need for K-actor implementation.
    Thus, this code has no multiprocessing functionality. It could be easily
    added as part of the main.py script.

    In the original D4PG paper, it is suggested in the data that PER does
    not have significant (or perhaps any at all) effect on the speed or
    stability of learning. Thus, it too has been left out of this
    implementation but may be added as a future TODO item.
    """
    def __init__(self,
                 env,
                 args,
                 e_decay=1,
                 e_min=0.05,
                 l2_decay=0.0001,
                 update_type="hard"):
        """
        Initialize a D4PG Agent.
        """

        self.device = args.device
        self.framework = "D4PG"
        self.eval = args.eval
        self.agent_count = env.agent_count
        self.actor_learn_rate = args.actor_learn_rate
        self.critic_learn_rate = args.critic_learn_rate
        self.batch_size = args.batch_size
        self.buffer_size = args.buffer_size
        self.action_size = env.action_size
        self.state_size = env.state_size
        self.C = args.C
        self._e = args.e
        self.e_decay = e_decay
        self.e_min = e_min
        self.gamma = args.gamma
        self.rollout = args.rollout
        self.tau = args.tau
        self.update_type = update_type

        self.num_atoms = args.num_atoms
        self.vmin = args.vmin
        self.vmax = args.vmax
        self.atoms = torch.linspace(self.vmin, self.vmax,
                                    self.num_atoms).to(self.device)

        self.t_step = 0
        self.episode = 0

        # Set up memory buffers, currently only standard replay is implemented #
        self.memory = ReplayBuffer(self.device, self.buffer_size, self.gamma,
                                   self.rollout)

        #                    Initialize ACTOR networks                         #
        self.actor = ActorNet(args.layer_sizes, self.state_size,
                              self.action_size).to(self.device)
        self.actor_target = ActorNet(args.layer_sizes, self.state_size,
                                     self.action_size).to(self.device)
        self._hard_update(self.actor, self.actor_target)
        self.actor_optim = optim.Adam(self.actor.parameters(),
                                      lr=self.actor_learn_rate,
                                      weight_decay=l2_decay)

        #                   Initialize CRITIC networks                         #
        self.critic = CriticNet(args.layer_sizes, self.state_size,
                                self.action_size,
                                self.num_atoms).to(self.device)
        self.critic_target = CriticNet(args.layer_sizes, self.state_size,
                                       self.action_size,
                                       self.num_atoms).to(self.device)
        self._hard_update(self.actor, self.actor_target)
        self.critic_optim = optim.Adam(self.critic.parameters(),
                                       lr=self.critic_learn_rate,
                                       weight_decay=l2_decay)

        self.new_episode()

    def act(self, states, eval=False):
        """
        Predict an action using a policy/ACTOR network π.
        Scaled noise N (gaussian distribution) is added to all actions todo
        encourage exploration.
        """

        states = states.to(self.device)
        with torch.no_grad():
            actions = self.actor(states).detach().cpu().numpy()
        if not eval:
            noise = self._gauss_noise(actions.shape)
            actions += noise
        return np.clip(actions, -1, 1)

    def step(self, states, actions, rewards, next_states, pretrain=False):
        """
        Add the current SARS' tuple into the short term memory, then learn
        """

        # Current SARS' stored in short term memory, then stacked for NStep
        experience = list(zip(states, actions, rewards, next_states))
        self.memory.store_experience(experience)
        self.t_step += 1

        # Learn after done pretraining
        if not pretrain:
            self.learn()

    def learn(self):
        """
        Performs a distributional Actor/Critic calculation and update.
        Actor πθ and πθ'
        Critic Zw and Zw' (categorical distribution)
        """

        # Sample from replay buffer, REWARDS are sum of ROLLOUT timesteps
        # Already calculated before storing in the replay buffer.
        # NEXT_STATES are ROLLOUT steps ahead of STATES
        batch = self.memory.sample(self.batch_size)
        states, actions, rewards, next_states = batch
        atoms = self.atoms.unsqueeze(0)
        # Calculate Yᵢ from target networks using πθ' and Zw'
        # These tensors are not needed for backpropogation, so detach from the
        # calculation graph (literally doubles runtime if this is not detached)
        target_dist = self._get_targets(rewards, next_states).detach()

        # Calculate log probability DISTRIBUTION using Zw w.r.t. stored actions
        log_probs = self.critic(states, actions, log=True)

        # Calculate the critic network LOSS (Cross Entropy), CE-loss is ideal
        # for categorical value distributions as utilized in D4PG.
        # estimates distance between target and projected values
        critic_loss = -(target_dist * log_probs).sum(-1).mean()

        # Predict action for actor network loss calculation using πθ
        predicted_action = self.actor(states)

        # Predict value DISTRIBUTION using Zw w.r.t. action predicted by πθ
        probs = self.critic(states, predicted_action)

        # Multiply probabilities by atom values and sum across columns to get
        # Q-Value
        expected_reward = (probs * atoms).sum(-1)

        # Calculate the actor network LOSS (Policy Gradient)
        # Take the mean across the batch and multiply in the negative to
        # perform gradient ascent
        actor_loss = -expected_reward.mean()

        # Perform gradient ascent
        self.actor_optim.zero_grad()
        actor_loss.backward()
        self.actor_optim.step()

        # Perform gradient descent
        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()

        self._update_networks()

        self.actor_loss = actor_loss.item()
        self.critic_loss = critic_loss.item()

    def initialize_memory(self, pretrain_length, env):
        """
        Fills up the ReplayBuffer memory with PRETRAIN_LENGTH number of experiences
        before training begins.
        """

        if len(self.memory) >= pretrain_length:
            print("Memory already filled, length: {}".format(len(self.memory)))
            return

        print("Initializing memory buffer.")
        states = env.states
        while len(self.memory) < pretrain_length:
            actions = np.random.uniform(-1, 1,
                                        (self.agent_count, self.action_size))
            next_states, rewards, dones = env.step(actions)
            self.step(states, actions, rewards, next_states, pretrain=True)
            if self.t_step % 10 == 0 or len(self.memory) >= pretrain_length:
                print("Taking pretrain step {}... memory filled: {}/{}\
                    ".format(self.t_step, len(self.memory), pretrain_length))

            states = next_states
        print("Done!")
        self.t_step = 0

    def _get_targets(self, rewards, next_states):
        """
        Calculate Yᵢ from target networks using πθ' and Zw'
        """

        target_actions = self.actor_target(next_states)
        target_probs = self.critic_target(next_states, target_actions)
        # Project the categorical distribution onto the supports
        projected_probs = self._categorical(rewards, target_probs)
        return projected_probs

    def _categorical(self, rewards, probs):
        """
        Returns the projected value distribution for the input state/action pair

        While there are several very similar implementations of this Categorical
        Projection methodology around github, this function owes the most
        inspiration to Zhang Shangtong and his excellent repository located at:
        https://github.com/ShangtongZhang
        """

        # Create local vars to keep code more concise
        vmin = self.vmin
        vmax = self.vmax
        atoms = self.atoms
        num_atoms = self.num_atoms
        gamma = self.gamma
        rollout = self.rollout

        rewards = rewards.unsqueeze(-1)
        delta_z = (vmax - vmin) / (num_atoms - 1)

        # Rewards were stored with 0->(N-1) summed, take Reward and add it to
        # the discounted expected reward at N (ROLLOUT) timesteps
        projected_atoms = rewards + gamma**rollout * atoms.unsqueeze(0)
        projected_atoms.clamp_(vmin, vmax)
        b = (projected_atoms - vmin) / delta_z

        # It seems that on professional level GPUs (for instance on AWS), the
        # floating point math is accurate to the degree that a tensor printing
        # as 99.00000 might in fact be 99.000000001 in the backend, perhaps due
        # to binary imprecision, but resulting in 99.00000...ceil() evaluating
        # to 100 instead of 99. Forcibly reducing the precision to the minimum
        # seems to be the only solution to this problem, and presents no issues
        # to the accuracy of calculating lower/upper_bound correctly.
        precision = 1
        b = torch.round(b * 10**precision) / 10**precision
        lower_bound = b.floor()
        upper_bound = b.ceil()

        m_lower = (upper_bound +
                   (lower_bound == upper_bound).float() - b) * probs
        m_upper = (b - lower_bound) * probs

        projected_probs = torch.tensor(np.zeros(probs.size())).to(self.device)

        for idx in range(probs.size(0)):
            projected_probs[idx].index_add_(0, lower_bound[idx].long(),
                                            m_lower[idx].double())
            projected_probs[idx].index_add_(0, upper_bound[idx].long(),
                                            m_upper[idx].double())
        return projected_probs.float()

    @property
    def e(self):
        """
        This property ensures that the annealing process is run every time that
        E is called.
        Anneals the epsilon rate down to a specified minimum to ensure there is
        always some noisiness to the policy actions. Returns as a property.
        """

        self._e = max(self.e_min, self._e * self.e_decay)
        return self._e

    def _gauss_noise(self, shape):
        """
        Returns the epsilon scaled noise distribution for adding to Actor
        calculated action policy.
        """

        n = np.random.normal(0, 1, shape)
        return self.e * n

    def new_episode(self):
        """
        Handle any cleanup or steps to begin a new episode of training.
        """

        self.memory.init_n_step()
        self.episode += 1

    def _update_networks(self):
        """
        Updates the network using either DDPG-style soft updates (w/ param \
        TAU), or using a DQN/D4PG style hard update every C timesteps.
        """

        if self.update_type == "soft":
            self._soft_update(self.actor, self.actor_target)
            self._soft_update(self.critic, self.critic_target)
        elif self.t_step % self.C == 0:
            self._hard_update(self.actor, self.actor_target)
            self._hard_update(self.critic, self.critic_target)

    def _soft_update(self, active, target):
        """
        Slowly updated the network using every-step partial network copies
        modulated by parameter TAU.
        """

        for t_param, param in zip(target.parameters(), active.parameters()):
            t_param.data.copy_(self.tau * param.data +
                               (1 - self.tau) * t_param.data)

    def _hard_update(self, active, target):
        """
        Fully copy parameters from active network to target network. To be used
        in conjunction with a parameter "C" that modulated how many timesteps
        between these hard updates.
        """

        target.load_state_dict(active.state_dict())
Exemplo n.º 4
0
class Agent:
    def __init__(self, env):
        """Args:
            env(gym.Core.env): environment
        """

        with open('./configuration.json') as config_file:
            self.config = json.load(config_file)['agent']

        self.env = env
        self.state = None
        self.epsilon = None
        self.epsilon_decay = None
        self.state_size = None
        self.actions_size = None
        self.actor = None
        self.actor_target = None
        self.critic = None
        self.critic_target = None
        self.actor_optim = None
        self.critic_optim = None
        self.gamma = None
        self.memory = None
        self.batch_size = None
        self.action_space = None
        self.normalizer = None

    def __str__(self):
        return 'RL_Agent Object'

    def reset(self):
        self.action_space = self.env.action_space
        obs_space = self.env.observation_space.spaces
        obs_len = obs_space['observation'].shape[0]
        goal_len = obs_space['desired_goal'].shape[0]
        self.state_size = obs_len + goal_len
        self.actions_size = self.action_space.shape[0]
        max_action = float(self.env.action_space.high[0])

        self.actor = ActorNet(self.state_size, *self.config['net_sizes'],
                              self.actions_size, max_action)
        self.critic = CriticNet(self.state_size, *self.config['net_sizes'],
                                self.actions_size)
        self.actor_target = ActorNet(self.state_size,
                                     *self.config['net_sizes'],
                                     self.actions_size, max_action)
        self.critic_target = CriticNet(self.state_size,
                                       *self.config['net_sizes'],
                                       self.actions_size)
        self.actor_optim = Adam(self.actor.parameters(),
                                lr=self.config['learning_rate'])
        self.critic_optim = Adam(self.critic.parameters(),
                                 lr=self.config['learning_rate'])

        self.update(self.critic_target, self.critic, 1)
        self.update(self.actor_target, self.actor, 1)

        self.epsilon = self.config['epsilon']
        self.epsilon_decay = self.config['epsilon_decay']
        self.gamma = self.config['gamma']

        if self.config['PER']:
            self.memory = self.memory = PrioritizedMemory(
                self.config['memory_size'], self.config["memory_alpha"],
                self.config["memory_epsilon"], self.config["memory_beta"],
                self.config["memory_beta_increment"])
        else:
            self.memory = ReplayBuffer(self.config['memory_size'])

        self.batch_size = self.config['batch_size']
        self.normalizer = Normalizer(obs_len, goal_len)
        # warm up the normalizer
        self.normalizer.observe(self.env.reset())

    def run(self, train):
        total_reward = 0
        done = False
        self.state = self.env.reset()
        self.normalizer.observe(self.state)
        self.state = self.normalizer.normalize(self.state)
        ep_transitions = []

        # start episode
        while not done:
            if self.config['render']:
                self.env.render()

            # act and observe
            action = self._get_action_epsilon_greedy(self.state)
            obs, reward, done, info = self.env.step(action)
            total_reward += reward

            # normalize the state
            self.normalizer.observe(obs)
            obs = self.normalizer.normalize(obs)

            # save the transition for later HER processing
            transition = [self.state, reward, action, obs, not done]
            ep_transitions.append(transition)

            # save to memory
            self.append_sample_to_memory(
                *copy.deepcopy((flatten_state_dict_for_model(self.state),
                                reward, action,
                                flatten_state_dict_for_model(obs), not done)))

            self.state = obs

        if random.random() < self.config["her-probability"]:
            her_trs = generate_her_transitions(ep_transitions,
                                               self.env.compute_reward,
                                               self.config['her-type'],
                                               self.config['her-k_value'])
            for t in her_trs:
                self.append_sample_to_memory(*t)

        if len(self.memory) > self.batch_size * 5 and train:
            for i in range(40):
                self._train()
            self.soft_update_networks()

        if self.epsilon > self.config['epsilon_min']:
            self.epsilon *= self.epsilon_decay

        return total_reward

    def _train(self):
        batch, indexes, importance_sampling_weights = None, None, None
        if self.config['PER']:
            batch, indexes, importance_sampling_weights = \
                self.sample_from_per_memory(self.batch_size)
            importance_sampling_weights = torch.Tensor(
                importance_sampling_weights)
        else:
            batch = self.memory.get_random_batch(self.batch_size)

        state_batch = torch.Tensor(batch[0])
        reward_batch = torch.Tensor(batch[1])
        action_batch = torch.Tensor(batch[2])
        next_state_batch = torch.Tensor(batch[3])
        # unused - see additional info in the Readme
        # mask_batch = torch.Tensor(batch[4] * 1)

        next_q_values = self.critic_target(next_state_batch,
                                           self.actor_target(next_state_batch))
        expected_q_values = reward_batch + (self.gamma * next_q_values)
        expected_q_values = expected_q_values.clamp_(-50., 0.).detach()

        self.critic_optim.zero_grad()
        q_values = self.critic(state_batch, action_batch)

        if self.config['PER']:
            errors = torch.abs(q_values - expected_q_values)
            critic_loss = (importance_sampling_weights * errors**2).sum()
            for i in range(self.batch_size):
                index = indexes[i]
                self.memory.update(index, errors[i].detach().numpy())
        else:
            critic_loss = mse_loss(q_values, expected_q_values)
        critic_loss.backward()

        self.critic_optim.step()

        self.actor_optim.zero_grad()
        policy_loss = self.critic(state_batch, self.actor(state_batch))
        action_reg = (self.actor.action_preact**2).mean()
        policy_loss = -policy_loss.mean() + action_reg
        policy_loss.backward()

        self.actor_optim.step()

    def get_action_greedy(self, state):
        """Hey, actor - act!... plus detach().numpy() ..."""
        return self.actor(flatten_state_dict_for_model(state)).detach().numpy()

    def _get_action_epsilon_greedy(self, state):
        """Returns an action for given state by using the actor network.
        With epsilon probability, it returns a fully random action.
        In both cases, there is a OU noise added as well.
        Parameters can be specified in the configuration file.
        """

        if random.random() > self.epsilon:
            action = self.get_action_greedy(state) + \
                     np.random.normal(scale=0.2, size=self.actions_size)
        else:
            action = self.env.action_space.sample()
        return np.clip(action, -1., 1.)

    def append_sample_to_memory(self, state, reward, action, next_state, done):
        """Adds given transition to the memory. In case of using Prioritized
        Experience Replay, it calculates the TD error."""
        if not self.config['PER']:
            self.memory.append((state, reward, action, next_state, done))
        else:
            q = self.critic(
                torch.Tensor(state).unsqueeze(0),
                torch.Tensor(action).unsqueeze(0))

            target_val = self.critic_target(
                torch.Tensor(next_state).unsqueeze(0),
                self.actor_target(torch.Tensor(next_state).unsqueeze(0)))

            target = reward + (self.gamma * target_val * (done * 1)).detach()
            error = abs(q - target).detach().numpy()
            self.memory.add((state, reward, action, next_state, done), error)

    def soft_update_networks(self):
        self.update(self.critic_target, self.critic,
                    self.config['network_update_amount'])
        self.update(self.actor_target, self.actor,
                    self.config['network_update_amount'])

    def update(self, target, src, amount):
        for target_param, param in zip(target.parameters(), src.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - amount) +
                                    param.data * amount)

    def sample_from_per_memory(self, batch_size):
        transition_batch, indexes, importance_sampling_weights = \
            self.memory.sample(batch_size)

        x, r, u, y, d = [], [], [], [], []
        for i in transition_batch:
            X, R, U, Y, D = i
            x.append(np.array(X, copy=False))
            y.append(np.array(Y, copy=False))
            u.append(np.array(U, copy=False))
            r.append(np.array(R, copy=False))
            d.append(np.array(D, copy=False))

        return ((np.array(x), np.array(r).reshape(-1,
                                                  1), np.array(u), np.array(y),
                 np.array(d).reshape(-1,
                                     1)), indexes, importance_sampling_weights)
Exemplo n.º 5
0
class Learner:
    def __init__(self, n_actors):
        self.env = suite.load(domain_name="walker", task_name="run")
        self.n_actions = self.env.action_spec().shape[0]
        self.obs_size = get_obs(self.env.reset().observation).shape[1]

        self.n_actors = n_actors
        self.burn_in_length = 20  # 40-80
        self.learning_length = 40
        self.sequence_length = self.burn_in_length + self.learning_length
        self.n_step = 5
        self.memory_sequence_size = 5000000
        self.batch_size = 32
        self.memory = LearnerReplayMemory(
            memory_sequence_size=self.memory_sequence_size,
            batch_size=self.batch_size)

        self.model_path = './model_data/'
        self.memory_path = './memory_data/'
        self.actor = ActorNet(self.obs_size, self.n_actions, 0).cuda()
        self.target_actor = deepcopy(self.actor).eval()
        self.critic = CriticNet(self.obs_size, self.n_actions, 0).cuda()
        self.target_critic = deepcopy(self.critic).eval()
        self.model_save_interval = 50  # 50
        self.memory_update_interval = 50  # 50
        self.target_update_inverval = 500  # 100

        self.gamma = 0.997
        self.actor_lr = 1e-4
        self.critic_lr = 1e-3
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.actor_lr)
        self.actor_criterion = nn.MSELoss()
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=self.critic_lr)
        self.critic_criterion = nn.MSELoss()
        self.save_model()

    def save_model(self):
        model_dict = {
            'actor': self.actor.state_dict(),
            'target_actor': self.target_actor.state_dict(),
            'critic': self.critic.state_dict(),
            'target_critic': self.target_critic.state_dict()
        }
        torch.save(model_dict, self.model_path + 'model.pt')

    def update_target_model(self):
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

    def run(self):
        # memory not enough
        while self.memory.sequence_counter < self.batch_size * 100:
            for i in range(self.n_actors):
                is_memory = os.path.isfile(self.memory_path +
                                           '/memory{}.pt'.format(i))
                if is_memory:
                    self.memory.load(i)
                sleep(0.1)
            print('learner memory sequence size:',
                  self.memory.sequence_counter)

        step = 0
        while True:
            if step % 100 == 0:
                print('learning step:', step)
            start = time()
            step += 1

            episode_index, sequence_index, obs_seq, action_seq, reward_seq, terminal_seq, a_state, ta_state, c_state, tc_state = self.memory.sample(
            )

            self.actor.set_state(a_state[0], a_state[1])
            self.target_actor.set_state(ta_state[0], ta_state[1])
            self.critic.set_state(c_state[0], c_state[1])
            self.target_critic.set_state(tc_state[0], tc_state[1])

            ### burn-in step ###
            _ = [self.actor(obs) for obs in obs_seq[0:self.burn_in_length]]
            _ = [
                self.critic(obs, action)
                for obs, action in zip(obs_seq[0:self.burn_in_length],
                                       action_seq[0:self.burn_in_length])
            ]
            _ = [
                self.target_actor(obs)
                for obs in obs_seq[0:self.burn_in_length + self.n_step]
            ]
            _ = [
                self.target_critic(obs, action) for obs, action in zip(
                    obs_seq[0:self.burn_in_length +
                            self.n_step], action_seq[0:self.burn_in_length +
                                                     self.n_step])
            ]

            ### learning steps ###

            # update ciritic
            q_value = torch.zeros(self.learning_length * self.batch_size,
                                  self.n_actions).cuda()
            target_q_value = torch.zeros(
                self.learning_length * self.batch_size, self.n_actions).cuda()
            for i in range(self.learning_length):
                obs_i = self.burn_in_length + i
                next_obs_i = self.burn_in_length + i + self.n_step
                q_value[i * self.batch_size:(i + 1) *
                        self.batch_size] = self.critic(obs_seq[obs_i],
                                                       action_seq[obs_i])
                next_q_value = self.target_critic(
                    obs_seq[next_obs_i],
                    self.target_actor(obs_seq[next_obs_i]))
                target_q_val = reward_seq[obs_i] + (
                    self.gamma**self.n_step) * (
                        1. - terminal_seq[next_obs_i - 1]) * next_q_value
                target_q_val = invertical_vf(target_q_val)
                target_q_value[i * self.batch_size:(i + 1) *
                               self.batch_size] = target_q_val

            critic_loss = self.actor_criterion(q_value,
                                               target_q_value.detach())
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # update actor
            self.actor.reset_state()
            self.critic.reset_state()
            actor_loss = torch.zeros(self.learning_length * self.batch_size,
                                     self.n_actions).cuda()
            for i in range(self.learning_length):
                obs_i = i + self.burn_in_length
                action = self.actor(obs_seq[obs_i])
                actor_loss[i * self.batch_size:(i + 1) *
                           self.batch_size] = -self.critic(
                               obs_seq[obs_i], self.actor(obs_seq[obs_i]))
            actor_loss = actor_loss.mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # update target networks
            if step % self.target_update_inverval == 0:
                self.update_target_model()

            # calc priority
            average_td_loss = np.mean(
                (q_value - target_q_value).detach().cpu().numpy()**2., axis=1)
            for i in range(len(episode_index)):
                td = average_td_loss[i:-1:self.batch_size]
                self.memory.priority[episode_index[i]][
                    sequence_index[i]] = calc_priority(td)
                self.memory.total_priority[episode_index[i]] = sum(
                    self.memory.priority[episode_index[i]])

            if step % self.model_save_interval == 0:
                self.save_model()

            if step % self.memory_update_interval == 0:
                for i in range(self.n_actors):
                    is_memory = os.path.isfile(self.memory_path +
                                               '/memory{}.pt'.format(i))
                    if is_memory:
                        self.memory.load(i)
                    sleep(0.1)

            self.actor.reset_state()
            self.target_actor.reset_state()
            self.critic.reset_state()
            self.target_critic.reset_state()