Example No. 1
class Agent():
    """
  The base agent class. Important: Agents should almost always be generated from a config_dict
  using mrl.util.config_to_agent(config_dict). See configs folder for default configs / examples.
  
  Agent is a flat collection of mrl.Module, which may include:
    - environments (train/eval)
    - replay buffer(s)
    - new task function
    - action function  (exploratory + greedy) 
    - loss function
    - intrinsic curiosity module 
    - value / policy networks and other models (e.g. goal generation)
    - planner (e.g., MCTS)
    - logger
    - anything else you want (image tagger, human interface, etc.)

  Agent has some lifecycle methods (process_experience, optimize, save, load) that call the 
  corresponding lifecycle hooks on modules that declare them.

  Modules have a reference to the Agent so that they can access each other via the Agent. Actually,
  modules use __getattr__ to access the agent directly (via self.*), so they are effectively agent
  methods that are defined in separate files / have their own initialize/save/load functions.

  Modules are registered and saved/restored individually. This lets you swap out / tweak individual
  agent methods without subclassing the agent. Individual saves let you swap out saved modules via
  the filesystem (good for, e.g., BatchRL) and avoid pickling problems from non-picklable modules.
  """
    def __init__(
        self,
        module_list: Iterable,  # list of mrl.Modules (possibly nested)
        config: AttrDict):  # hyperparameters and module settings

        self.config = config
        parent_folder = config.parent_folder
        assert parent_folder, "Setting the agent's parent folder is required!"
        self.agent_name = config.get('agent_name') or 'agent_' + short_timestamp()
        self.agent_folder = os.path.join(parent_folder, self.agent_name)
        load_agent = False
        if os.path.exists(self.agent_folder):
            print('Detected existing agent! Loading agent from checkpoint...')
            load_agent = True
        else:
            os.makedirs(self.agent_folder, exist_ok=True)

        self._process_experience_registry = []  # modules that define _process_experience
        self._optimize_registry = []  # modules that define _optimize
        self.config.env_steps = 0
        self.config.opt_steps = 0

        module_list = flatten_modules(module_list)
        self.module_dict = AttrDict()
        for module in module_list:
            assert module.module_name
            setattr(self, module.module_name, module)
            self.module_dict[module.module_name] = module
        for module in module_list:
            self._register_module(module)

        self.training = True

        if load_agent:
            self.load()
            print('Successfully loaded saved agent!')
        else:
            self.save()

    def train_mode(self):
        """Set agent to train mode; exploration / use dropout / etc. As in Pytorch."""
        self.training = True

    def eval_mode(self):
        """Set agent to eval mode; act deterministically / don't use dropout / etc."""
        self.training = False

    def process_experience(self, experience: AttrDict):
        """Calls the _process_experience function of each relevant module
    (typically, these will include a replay buffer and one or more logging modules)"""
        self.config.env_steps += self.env.num_envs if hasattr(self, 'env') else 1
        for module in self._process_experience_registry:
            module._process_experience(experience)

    def optimize(self):
        """Calls the _optimize function of each relevant module
    (typically, this will be the main algorithm; but may include others)"""
        self.config.opt_steps += 1
        for module in self._optimize_registry:
            module._optimize()

    def _register_module(self, module):
        """
    Provides module with a reference to agent so that modules can interact; e.g., 
    allows agent's policy to reference the value function.

    Then, calls each module's _setup and verify methods to _setup the module and
    verify that agent has all required modules.
    """
        self.module_dict[module.module_name] = module

        module.agent = self
        module.verify_agent_compatibility()
        module._setup()
        module.new_task()
        if hasattr(module, '_process_experience'):
            self._process_experience_registry.append(module)
        if hasattr(module, '_optimize'):
            self._optimize_registry.append(module)

    def set_module(self, module_name, module):
        """
    Sets a module (can be used to switch environments / policies)
    """
        setattr(self, module_name, module)
        self._register_module(module)

    def save(self, subfolder: Optional[str] = None):
        """
    The state of all stateful modules is saved to the agent's folder.
    The agent itself is NOT saved, and should be (1) rebuilt, and (2) restored using self.load().
    Subfolder can be used to save various checkpoints of the same agent.
    """
        save_folder = self.agent_folder
        subfolder = subfolder or 'checkpoint'
        save_folder = os.path.join(save_folder, subfolder)

        if not os.path.exists(save_folder):
            os.makedirs(save_folder)

        for module in self.module_dict.values():
            module.save(save_folder)

        with open(os.path.join(save_folder, 'config.pickle'), 'wb') as f:
            pickle.dump(self.config, f)

    def load(self, subfolder: Optional[str] = None):
        """
    Restores state of stateful modules from the agent's folder[/subfolder].
    """
        save_folder = self.agent_folder
        subfolder = subfolder or 'checkpoint'
        save_folder = os.path.join(save_folder, subfolder)

        assert os.path.exists(save_folder), "load path does not exist!"

        with open(os.path.join(save_folder, 'config.pickle'), 'rb') as f:
            self.config = pickle.load(f)

        for module in self.module_dict.values():
            print("Loading module {}".format(module.module_name))
            module.load(save_folder)

    def save_checkpoint(self, checkpoint_dir):
        """
    Saves the agent together with its replay buffer, regardless of the save_replay_buf setting.
    Keeps 2 saves in the folder in case the job is killed and the last
    checkpoint is corrupted.

    NOTE: You should call agent.save to save to the main folder BEFORE calling this.
    """
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        with open(os.path.join(checkpoint_dir, 'INITIALIZED'), 'w') as f:
            f.write('INITIALIZED')

        subfolder1 = os.path.join(checkpoint_dir, '1')
        subfolder2 = os.path.join(checkpoint_dir, '2')

        os.makedirs(os.path.join(subfolder1, 'checkpoint'), exist_ok=True)
        os.makedirs(os.path.join(subfolder2, 'checkpoint'), exist_ok=True)

        done1 = os.path.join(subfolder1, 'DONE')
        done2 = os.path.join(subfolder2, 'DONE')

        if not os.path.exists(done1):
            savedir = subfolder1
            done_file = done1
        elif not os.path.exists(done2):
            savedir = subfolder2
            done_file = done2
        else:
            modtime1 = os.path.getmtime(done1)
            modtime2 = os.path.getmtime(done2)
            if modtime1 < modtime2:
                savedir = subfolder1
                done_file = done1
            else:
                savedir = subfolder2
                done_file = done2

            os.remove(done_file)

        savedir_checkpoint = os.path.join(savedir, 'checkpoint')
        # First save all modules, including replay buffer
        old_save_replay_buf = self.config.save_replay_buf
        self.config.save_replay_buf = True
        for module in self.module_dict.values():
            module.save(savedir_checkpoint)
        self.config.save_replay_buf = old_save_replay_buf

        # Now save the config also
        with open(os.path.join(savedir_checkpoint, 'config.pickle'),
                  'wb') as f:
            pickle.dump(self.config, f)

        # Now copy over the config and results files from the agent_folder
        files_and_folders = glob.glob(os.path.join(self.agent_folder, '*'))
        for file_or_folder in files_and_folders:
            if os.path.isfile(file_or_folder):
                shutil.copy(file_or_folder, savedir)

        # Finally, write the DONE file.
        with open(done_file, 'w') as f:
            f.write('DONE')

    def load_from_checkpoint(self, checkpoint_dir):
        """
    This loads an agent from a checkpoint_dir to which it was saved using the `save_checkpoint` method.
    """
        subfolder1 = os.path.join(checkpoint_dir, '1')
        subfolder2 = os.path.join(checkpoint_dir, '2')
        done1 = os.path.join(subfolder1, 'DONE')
        done2 = os.path.join(subfolder2, 'DONE')

        if not os.path.exists(done1):
            assert os.path.exists(done2)
            savedir = subfolder2
        elif not os.path.exists(done2):
            savedir = subfolder1
        else:
            modtime1 = os.path.getmtime(done1)
            modtime2 = os.path.getmtime(done2)
            if modtime1 > modtime2:
                savedir = subfolder1
            else:
                savedir = subfolder2

        savedir_checkpoint = os.path.join(savedir, 'checkpoint')

        # First load the agent
        with open(os.path.join(savedir_checkpoint, 'config.pickle'),
                  'rb') as f:
            self.config = pickle.load(f)

        for module in self.module_dict.values():
            print("Loading module {}".format(module.module_name))
            module.load(savedir_checkpoint)

        # Then copy over the config and results file to the agent_folder
        files_and_folders = glob.glob(os.path.join(savedir, '*'))
        for file_or_folder in files_and_folders:
            if os.path.isfile(file_or_folder):
                shutil.copy(file_or_folder, self.agent_folder)

    def torch(self, x):
        """Converts x to a torch.FloatTensor on the configured device (no-op for tensors)."""
        if isinstance(x, torch.Tensor):
            return x
        return torch.FloatTensor(x).to(self.config.device)

    def numpy(self, x):
        """Converts a torch tensor to a detached numpy array on the CPU."""
        return x.cpu().detach().numpy()
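
For orientation, the docstring above describes a flat module registry in which each module keeps a back-reference to the agent and reaches its siblings through attribute access (__getattr__ falls through to the agent). Below is a minimal, self-contained sketch of that pattern; it is not mrl code, and the module names (Counter, Doubler) are hypothetical, chosen only to show the lifecycle registry and the attribute fall-through.

class Module:
    """Toy stand-in for mrl.Module (hypothetical, for illustration only)."""
    def __init__(self, module_name):
        self.module_name = module_name
        self.agent = None

    def __getattr__(self, name):
        # Only called when normal lookup fails; fall through to the agent so a
        # module can reach its siblings as self.<other_module_name>.
        agent = self.__dict__.get('agent')
        if agent is not None:
            return getattr(agent, name)
        raise AttributeError(name)


class TinyAgent:
    """Toy stand-in for Agent: flat module attributes plus an _optimize registry."""
    def __init__(self, modules):
        self._optimize_registry = []
        for m in modules:
            setattr(self, m.module_name, m)  # modules become flat agent attributes
            m.agent = self                   # back-reference used by __getattr__
            if hasattr(m, '_optimize'):      # register modules that define the hook
                self._optimize_registry.append(m)

    def optimize(self):
        for m in self._optimize_registry:
            m._optimize()


class Counter(Module):
    def __init__(self):
        super().__init__('counter')
        self.steps = 0


class Doubler(Module):
    def __init__(self):
        super().__init__('doubler')

    def _optimize(self):
        self.counter.steps += 2  # reaches the sibling module through the agent


agent = TinyAgent([Counter(), Doubler()])
agent.optimize()
print(agent.counter.steps)  # -> 2

The same mechanism is what the class docstring refers to when it says a policy module can reference the value function via self.*, without the agent being subclassed.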
Example No. 2
class EntropyPrioritizedOnlineHERBuffer(mrl.Module):

  def __init__(
      self,
      module_name='prioritized_replay',
      rank_method='dense',
      temperature=1.0
  ):
    """
    Buffer that stores entropy of trajectories for prioritized replay
    """

    super().__init__(module_name, required_agent_modules=['env','replay_buffer'], locals=locals())

    self.goal_space = None
    self.buffer = None
    self.rank_method = rank_method
    self.temperature = temperature
    self.traj_len = None

  def _setup(self):
    self.ag_buffer = self.replay_buffer.buffer.BUFF.buffer_ag

    env = self.env
    assert type(env.observation_space) == gym.spaces.Dict
    self.goal_space = env.observation_space.spaces["desired_goal"]

    # Note: for now we apply entropy estimation on the achieved goal (ag) space
    # Define the buffers to store for prioritization
    items = [("entropy", (1,)), ("priority", (1,))]
    self.buffer = AttrDict()
    for name, shape in items:
      self.buffer['buffer_' + name] = RingBuffer(self.ag_buffer.maxlen, shape=shape)

    self._subbuffers = [[] for _ in range(self.env.num_envs)]
    self.n_envs = self.env.num_envs

    # Placeholder for the mixture model used to estimate trajectory density
    self.clf = 0

  def fit_density_model(self):
    ag = self.ag_buffer.data[0:self.size].copy()
    X_train = ag.reshape(-1, self.traj_len * ag.shape[-1]) # [num_episodes, episode_len * goal_dim]

    self.clf = mixture.BayesianGaussianMixture(weight_concentration_prior_type="dirichlet_distribution", n_components=3)
    self.clf.fit(X_train)
    pred = -self.clf.score_samples(X_train)

    self.pred_min = pred.min()
    pred = pred - self.pred_min
    pred = np.clip(pred, 0, None)
    self.pred_sum = pred.sum()
    pred = pred / self.pred_sum
    self.pred_avg = (1 / pred.shape[0])
    pred = np.repeat(pred, self.traj_len, axis=0)

    self.buffer.buffer_entropy.data[:self.size] = pred.reshape(-1,1).copy()

  def _process_experience(self, exp):
    # Compute the entropy 
    # TODO: Include previous achieved goal too? or use that instead of ag?
    achieved = exp.next_state['achieved_goal']
    for i in range(self.n_envs):
      self._subbuffers[i].append([achieved[i]])
    
    for i in range(self.n_envs):
      if exp.trajectory_over[i]:
        # TODO: Compute the entropy of the trajectory
        traj_len = len(self._subbuffers[i])
        if self.traj_len is None:
          self.traj_len = traj_len
        else:
          # Current implementation assumes the same length for all trajectories
          assert traj_len == self.traj_len

        if not isinstance(self.clf, int):
          ag = [np.stack(a) for a in zip(*self._subbuffers[i])][0] # [episode_len, goal_dim]
          X = ag.reshape(-1, ag.shape[0]*ag.shape[1])
          pred = -self.clf.score_samples(X)

          pred = pred - self.pred_min
          pred = np.clip(pred, 0, None)
          pred = pred / self.pred_sum # Shape (1,)

          entropy = np.ones((traj_len,1)) * pred
        else:
          # Not enough data to train mixture density yet, set entropy to be zero
          entropy = np.zeros((traj_len, 1))
        
        priority = np.zeros((traj_len,1))
        trajectory = [entropy, priority]
        
        # TODO: Update the trajectory with entropy
        self.add_trajectory(*trajectory)

        self._subbuffers[i] = []

        # TODO: Update the rank here before adding it to the trajectory?
        self.update_priority()

  def add_trajectory(self, *items):
    """
    Append a trajectory of transitions to the buffer.

    :param items: a list of batched transition values to append to the replay buffer,
        in the item order that we initialized the ReplayBuffer with.
    """
    for buffer, batched_values in zip(self.buffer.values(), items):
      buffer.append_batch(batched_values)

  def update_priority(self):
    """
    After adding a trajectory to the replay buffer, update the ranking of transitions
    """
    # Note: 'dense' assigns the next highest element with the rank immediately 
    # after those assigned to the tied elements.
    entropy_transition_total = self.buffer.buffer_entropy.data[:self.size]
    entropy_rank = rankdata(entropy_transition_total, method=self.rank_method)
    entropy_rank = (entropy_rank - 1).reshape(-1, 1)
    self.buffer.buffer_priority.data[:self.size] = entropy_rank

  def __call__(self, batch_size):
    """
    Samples batch_size number of indices from main replay_buffer.

    Args:
      batch_size (int): size of the batch to sample
    
    Returns:
      batch_idxs: a 1-D numpy array of length batch_size containing indices
                  sampled in prioritized manner
    """
    if self.rank_method == 'none':
      entropy_trajectory = self.buffer.buffer_entropy.data[:self.size]
    else:
      entropy_trajectory = self.buffer.buffer_priority.data[:self.size]
    
    # Factorize out sampling into sampling trajectory according to priority/entropy
    # then sample time uniformly independently
    entropy_trajectory = entropy_trajectory.reshape(-1, self.traj_len)[:,0]
    p_trajectory = np.power(entropy_trajectory, 1/(self.temperature+1e-2))
    p_trajectory = p_trajectory / p_trajectory.sum()
    
    num_trajectories = p_trajectory.shape[0]
    batch_tidx = np.random.choice(num_trajectories, size=batch_size, p=p_trajectory)
    batch_idxs = self.traj_len * batch_tidx + np.random.choice(self.traj_len, size=batch_size)

    return batch_idxs

  @property
  def size(self):
    return len(self.ag_buffer)

  def save(self, save_folder):
    if self.config.save_replay_buf:
      state = self.buffer._get_state()
      with open(os.path.join(save_folder, "{}.pickle".format(self.module_name)), 'wb') as f:
        pickle.dump(state, f)

  def load(self, save_folder):
    load_path = os.path.join(save_folder, "{}.pickle".format(self.module_name))
    if os.path.exists(load_path):
      with open(load_path, 'rb') as f:
        state = pickle.load(f)
      self.buffer._set_state(state)
    else:
      self.logger.log_color('###############################################################', '', color='red')
      self.logger.log_color('WARNING', 'Replay buffer is not being loaded / was not saved.', color='cyan')
      self.logger.log_color('WARNING', 'Replay buffer is not being loaded / was not saved.', color='red')
      self.logger.log_color('WARNING', 'Replay buffer is not being loaded / was not saved.', color='yellow')
      self.logger.log_color('###############################################################', '', color='red')
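
To make the sampling rule in __call__ concrete, here is a small, self-contained numpy/scipy sketch that mirrors the same arithmetic on made-up data: trajectories are dense-ranked by entropy, the ranks are sharpened by 1/temperature and normalized, a trajectory index is drawn from that distribution, and a time step is drawn uniformly to form a flat transition index. The entropy values, trajectory count, and trajectory length below are invented for illustration.

import numpy as np
from scipy.stats import rankdata

# Toy illustration of the prioritized sampling in __call__ (values are made up).
traj_len = 5
entropy_per_traj = np.array([0.1, 0.4, 0.4, 2.0])  # one density score per trajectory
temperature = 1.0

# 'dense' ranking: tied trajectories share a rank; subtract 1 so ranks start at 0.
priority = rankdata(entropy_per_traj, method='dense') - 1   # -> [0, 1, 1, 2]

# Sharpen by 1/temperature and normalize into a distribution over trajectories.
p_trajectory = np.power(priority, 1 / (temperature + 1e-2))
p_trajectory = p_trajectory / p_trajectory.sum()

# Sample trajectories by priority, then a time step uniformly within each one.
batch_size = 8
batch_tidx = np.random.choice(len(p_trajectory), size=batch_size, p=p_trajectory)
batch_idxs = traj_len * batch_tidx + np.random.choice(traj_len, size=batch_size)
print(batch_idxs)  # flat indices into the transition-level replay buffer

Note that after the rank shift the lowest-entropy trajectory has priority 0, so with the default 'dense' ranking it is never sampled; with rank_method='none' the class falls back to the stored (normalized) entropy values instead.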