def _setup(self):
    """Build the actor/critic optimizers and register frozen target copies.

    Every ``PytorchModel`` in ``self.module_dict`` whose name starts with
    ``actor`` (then ``critic``) is collected, its parameters are added to the
    matching optimizer, and a ``<name>_target`` copy with identical weights is
    registered on the agent. Each ``(target.model, module.model)`` pair is
    stored in ``self.targets_and_models``.
    """
    self.targets_and_models = []

    def _collect(prefix, registry):
        """Fill ``registry`` with matching modules; return their parameters."""
        params = []
        # Iterate over a snapshot: set_module() is called inside the loop.
        for mod in list(self.module_dict.values()):
            mod_name = mod.module_name
            if not (mod_name.startswith(prefix) and isinstance(mod, PytorchModel)):
                continue
            registry.append(mod)
            params += list(mod.model.parameters())

            target_name = mod_name + '_target'
            tgt = mod.copy(target_name)
            tgt.model.load_state_dict(mod.model.state_dict())
            # Freeze target networks with respect to optimizers
            # (only update via polyak averaging)
            for weight in tgt.model.parameters():
                weight.requires_grad = False
            self.agent.set_module(target_name, tgt)
            self.targets_and_models.append((tgt.model, mod.model))
        return params

    # Actor setup
    self.actors = []
    actor_params = _collect('actor', self.actors)
    if actor_params:
        self.actor_opt = torch.optim.Adam(
            actor_params,
            lr=self.config.actor_lr,
            weight_decay=self.config.actor_weight_decay)
    else:
        # No actor parameters: stand-in object exposing only state_dict().
        self.actor_opt = AttrDict({'state_dict': lambda: []})
    self.actor_params = actor_params

    # Critic setup
    self.critics = []
    critic_params = _collect('critic', self.critics)
    self.critic_opt = torch.optim.Adam(
        critic_params,
        lr=self.config.critic_lr,
        weight_decay=self.config.critic_weight_decay)
    self.critic_params = critic_params

    self.action_scale = self.env.max_action
def step(self, action):
    """Step the wrapped env and post-process the result for the current mode.

    ``self.mode == 'Bob'``: a step whose reward is numerically zero ends the
    episode, sets ``info['is_success'] = True`` and drops a truthy
    ``'TimeLimit.truncated'`` entry from ``info``.

    ``self.mode == 'Alice'``: ``info`` is wrapped in an ``AttrDict`` and the
    reward is accumulated in ``self.total_rewards``. When the env reports
    ``done``, the returned ``done`` is only kept ``True`` if the episode was
    truncated by the time limit; ``done_observation`` and ``episodic_return``
    are recorded and the running return is reset.

    Returns:
        The ``(obs, reward, done, info)`` tuple.

    Raises:
        ValueError: if ``self.mode`` is neither ``'Bob'`` nor ``'Alice'``
            (previously this fell through and returned ``None``).
    """
    obs, reward, done, info = self.env.step(action)

    if self.mode == 'Bob':
        # First visit done for Bob
        if np.allclose(reward, 0.):
            done = True
            info['is_success'] = True
            if info.get('TimeLimit.truncated'):
                del info['TimeLimit.truncated']
        return obs, reward, done, info

    if self.mode == 'Alice':
        info = AttrDict(info)
        self.total_rewards += reward
        if done:
            done = False
            info.done_observation = obs
            # NOTE(review): terminal_state is only set on the truncated path
            # here (the unconditional `terminal_state = True` was disabled in
            # the original) -- confirm consumers tolerate it being absent.
            if info.get('TimeLimit.truncated'):
                done = True
                info.terminal_state = False
            info.episodic_return = self.total_rewards
            self.total_rewards = 0
        else:
            info.terminal_state = False
            info.episodic_return = None
        return obs, reward, done, info

    raise ValueError(
        "Unknown mode {!r}; expected 'Bob' or 'Alice'".format(self.mode))
def _overshoot_goals(self, experience, overshooting_idxs, overshooting_proposals):
    """Pick the best overshooting proposal per env and install it as the goal.

    Args:
        experience: object whose ``reset_state['observation']`` is indexed by
            ``overshooting_idxs``.
        overshooting_idxs: indices of the envs that receive a new goal.
        overshooting_proposals: candidate goals; axis 1 enumerates the
            proposals for each selected env (``shape[1]`` is read as the
            number of proposals).

    Side effects:
        For each selected env, writes the chosen proposal into
        ``self.current_goals`` and sets ``self.replaced_goal`` to 1.
    """
    # score the proposals
    num_proposals = overshooting_proposals.shape[1]
    num_idxs = len(overshooting_idxs)
    states = np.tile(
        experience.reset_state['observation'][overshooting_idxs, None, :],
        (1, num_proposals, 1))
    states = np.concatenate((states, overshooting_proposals), -1).reshape(
        num_proposals * num_idxs, -1)

    bad_q_idxs, q_values = [], None
    if self.use_qcutoff:
        q_values = self.compute_q(states)
        q_values = q_values.reshape(num_idxs, num_proposals)
        bad_q_idxs = q_values < self.cutoff
    goal_values = self.score_goals(
        overshooting_proposals, AttrDict(q_values=q_values, states=states))

    if self.config.dg_score_multiplier > 1. and self.dg_kde.ready:
        dg_scores = self.dg_kde.evaluate_log_density(
            overshooting_proposals.reshape(num_proposals * num_idxs, -1))
        dg_scores = dg_scores.reshape(num_idxs, num_proposals)
        goal_values[dg_scores > -np.inf] *= self.config.dg_score_multiplier

    # Only overwrite the below-cutoff scores when q-values were computed;
    # without the guard, `q_values` is None here and indexing it raises.
    if q_values is not None:
        goal_values[bad_q_idxs] = q_values[bad_q_idxs] * -1e-8

    # Lowest score wins; select it through a one-hot mask over the proposals.
    chosen_idx = np.argmin(goal_values, axis=1)
    chosen_idx = np.eye(num_proposals)[chosen_idx]  # num_idxs x num_proposals
    chosen_ags = np.sum(
        overshooting_proposals * chosen_idx[:, :, None], axis=1)  # num_idxs x goal_feats

    for idx, goal in zip(overshooting_idxs, chosen_ags):
        self.current_goals[idx] = goal
        self.replaced_goal[idx] = 1.
def __call__(self, num_episodes: int, *unused_args, any_success=False):
    """
    Runs evaluation episodes in ``self.eval_env`` until at least ``num_episodes``
    episodes have been collected, and returns the results.

    Results tracking is done here instead of in process_experience, since
    experiences aren't "real" experiences; e.g. agent cannot learn from them.

    With ``any_success=True`` an episode's success value is the maximum
    ``info['is_success']`` seen during the episode; otherwise it is the last
    value seen. Returns an AttrDict with ``rewards`` and ``steps`` lists.
    """
    self.eval_mode()
    eval_env = self.eval_env
    n_envs = eval_env.num_envs

    all_rewards, all_steps = [], []
    all_discounted = []
    all_successes = []
    saw_success_key = False

    while len(all_rewards) < num_episodes:
        state = eval_env.reset()
        finished = np.zeros((n_envs, ))
        step_counts = np.zeros((n_envs, ))
        success_flags = np.zeros((n_envs, ))
        reward_traces = [[] for _ in range(n_envs)]

        # Keep stepping until every env has finished one episode.
        while not np.all(finished):
            action = self.policy(state)
            state, rewards, step_dones, infos = eval_env.step(action)

            for i, (rew, done, info) in enumerate(zip(rewards, step_dones, infos)):
                if finished[i]:
                    continue
                reward_traces[i].append(rew)
                step_counts[i] += 1
                if done:
                    finished[i] = 1.
                if 'is_success' in info:
                    saw_success_key = True
                    if any_success:
                        success_flags[i] = max(info['is_success'], success_flags[i])
                    else:
                        success_flags[i] = info['is_success']

        for trace, n_steps, succ in zip(reward_traces, step_counts, success_flags):
            if saw_success_key:
                all_successes.append(succ)
            all_rewards.append(sum(trace))
            all_discounted.append(discounted_sum(trace, self.config.gamma))
            all_steps.append(n_steps)

    if hasattr(self, 'logger'):
        if len(all_successes):
            self.logger.add_scalar('Test/Success', np.mean(all_successes))
        self.logger.add_scalar('Test/Episode_rewards', np.mean(all_rewards))
        self.logger.add_scalar('Test/Discounted_episode_rewards',
                               np.mean(all_discounted))
        self.logger.add_scalar('Test/Episode_steps', np.mean(all_steps))

    return AttrDict({'rewards': all_rewards, 'steps': all_steps})
def debug_vectorized_experience(state, action, next_state, reward, done, info):
    """Package one vectorized env step into an experience record.

    Gym returns an ambiguous "done" signal and VecEnv doesn't let you fix it
    until now: for every env flagged in ``done``, the entry of the next state
    is replaced by that env's ``info[i].done_observation``; the untouched
    ``next_state`` is kept as ``reset_state``.
    See ReturnAndObsWrapper in env.py for where these info attributes are
    coming from.

    Returns:
        ``(next_state, experience)``.
    """
    experience = AttrDict(state=state, action=action, reward=reward, info=info)

    # deepcopy handles dict states
    patched_next = deepcopy(next_state)
    for hit in np.argwhere(done):
        env_idx = hit[0]
        if isinstance(patched_next, np.ndarray):
            patched_next[env_idx] = info[env_idx].done_observation
            continue
        assert isinstance(patched_next, dict)
        for key in patched_next:
            patched_next[key][env_idx] = info[env_idx].done_observation[key]

    experience.next_state = patched_next
    experience.trajectory_over = done
    terminal_flags = [info[k].terminal_state for k in range(len(done))]
    experience.done = np.array(terminal_flags, dtype=np.float32)
    experience.reset_state = next_state
    # Record or not trajectory in replay buffer
    experience.dont_record = np.zeros(len(reward))

    return next_state, experience
def step(self, action):
    """Step the wrapped env and annotate ``info`` with episode bookkeeping.

    ``info`` is returned as an ``AttrDict`` carrying ``terminal_state`` and
    ``episodic_return``; on ``done`` it also carries ``done_observation``.
    ``terminal_state`` is False when the episode ended through a truthy
    ``'TimeLimit.truncated'`` entry. The running return in
    ``self.total_rewards`` is reset whenever an episode ends.
    """
    obs, reward, done, raw_info = self.env.step(action)
    info = AttrDict(raw_info)
    self.total_rewards += reward

    if not done:
        info.terminal_state = False
        info.episodic_return = None
        return obs, reward, done, info

    info.done_observation = obs
    # A time-limit truncation ends the episode without being a true terminal.
    info.terminal_state = False if info.get('TimeLimit.truncated') else True
    info.episodic_return = self.total_rewards
    self.total_rewards = 0
    return obs, reward, done, info
def config_to_agent(config_dict: dict):
    """Build an ``Agent`` from a configuration dictionary.

    Values recognized by ``is_module_or_or_module_list`` are flattened into
    the agent's module list; every other entry is copied into the agent's
    ``AttrDict`` config under the same key.
    """
    modules = []
    settings = AttrDict()
    for key, value in config_dict.items():
        if not is_module_or_or_module_list(value):
            settings[key] = value
            continue
        modules += flatten_modules(value)
    return Agent(modules, settings)
def _setup(self):
    """Bind to the replay buffer's achieved-goal buffer and allocate storage.

    Creates one ``RingBuffer`` per prioritization item (``entropy`` and
    ``priority``), sized like the achieved-goal buffer, plus one pending
    sub-buffer per environment.
    """
    self.ag_buffer = self.replay_buffer.buffer.BUFF.buffer_ag
    env = self.env
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of gym.spaces.Dict.
    assert isinstance(env.observation_space, gym.spaces.Dict)
    self.goal_space = env.observation_space.spaces["desired_goal"]

    # Note: for now we apply entropy estimation on the achieved goal (ag) space
    # Define the buffers to store for prioritization
    items = [("entropy", (1,)), ("priority", (1,))]
    self.buffer = AttrDict()
    for name, shape in items:
        self.buffer['buffer_' + name] = RingBuffer(self.ag_buffer.maxlen, shape=shape)

    self._subbuffers = [[] for _ in range(self.env.num_envs)]
    self.n_envs = self.env.num_envs

    # Placeholder for the mixture model used to estimate trajectory density;
    # an int here means "not fitted yet".
    self.clf = 0
def __init__(
        self,
        module_list: Iterable,  # list of mrl.Modules (possibly nested)
        config: AttrDict):  # hyperparameters and module settings
    """Create the agent, register its modules, then load or save its state.

    If the agent folder (``config.parent_folder`` joined with the agent name)
    already exists, the agent is restored from it after all modules are
    registered; otherwise the folder is created and an initial save is made.
    """
    self.config = config
    parent_folder = config.parent_folder
    assert parent_folder, "Setting the agent's parent folder is required!"

    self.agent_name = config.get('agent_name') or 'agent_' + short_timestamp()
    self.agent_folder = os.path.join(parent_folder, self.agent_name)

    load_agent = os.path.exists(self.agent_folder)
    if load_agent:
        print('Detected existing agent! Loading agent from checkpoint...')
    else:
        os.makedirs(self.agent_folder, exist_ok=True)

    # modules which define _process_experience / _optimize
    self._process_experience_registry = []
    self._optimize_registry = []

    self.config.env_steps = 0
    self.config.opt_steps = 0

    modules = flatten_modules(module_list)
    self.module_dict = AttrDict()
    # First expose every module on the agent, then register them, so the
    # attributes for all modules exist before any registration runs.
    for mod in modules:
        assert mod.module_name
        setattr(self, mod.module_name, mod)
        self.module_dict[mod.module_name] = mod
    for mod in modules:
        self._register_module(mod)

    self.training = True

    if not load_agent:
        self.save()
    else:
        self.load()
        print('Successfully loaded saved agent!')
def config_to_agent(config_dict: dict):
    '''
    Agent factory: the method that actually creates the Agent.

    Entries recognized by ``is_module_or_or_module_list`` are flattened into
    the module list; all remaining entries become configuration values.

    :param config_dict: the dictionary of configuration parameters
    :return: the constructed Agent
    '''
    modules = []
    settings = AttrDict()
    for key, value in config_dict.items():
        if not is_module_or_or_module_list(value):
            settings[key] = value
            continue
        modules += flatten_modules(value)
    return Agent(modules, settings)
class Agent():
    """
    The base agent class. Important: Agents should almost always be generated from a config_dict
    using mrl.util.config_to_agent(config_dict). See configs folder for default configs / examples.

    Agent is a flat collection of mrl.Module, which may include:
      - environments (train/eval)
      - replay buffer(s)
      - new task function
      - action function (exploratory + greedy)
      - loss function
      - intrinsic curiosity module
      - value / policy networks and other models (e.g. goal generation)
      - planner (e.g., MCTS)
      - logger
      - anything else you want (image tagger, human interface, etc.)

    Agent has some lifecycle methods (process_experience, optimize, save, load) that call the
    corresponding lifecycle hooks on modules that declare them.

    Modules have a reference to the Agent so that they can access each other via the Agent. Actually,
    modules use __getattr__ to access the agent directly (via self.*), so they are effectively agent
    methods that are defined in separate files / have their own initialize/save/load functions.

    Modules are registered and saved/restored individually. This lets you swap out / tweak individual
    agent methods without subclassing the agent. Individual saves let you swap out saved modules via
    the filesystem (good for, e.g., BatchRL), avoid pickling problems from non-picklable modules.
    """

    def __init__(
            self,
            module_list: Iterable,  # list of mrl.Modules (possibly nested)
            config: "AttrDict"):  # hyperparameters and module settings
        """Register all modules, then load an existing agent or save a new one."""
        self.config = config
        parent_folder = config.parent_folder
        assert parent_folder, "Setting the agent's parent folder is required!"
        self.agent_name = config.get(
            'agent_name') or 'agent_' + short_timestamp()

        self.agent_folder = os.path.join(parent_folder, self.agent_name)
        load_agent = False
        if os.path.exists(self.agent_folder):
            print('Detected existing agent! Loading agent from checkpoint...')
            load_agent = True
        else:
            os.makedirs(self.agent_folder, exist_ok=True)

        self._process_experience_registry = []  # modules which define _process_experience
        self._optimize_registry = []  # modules which define _optimize
        self.config.env_steps = 0
        self.config.opt_steps = 0

        module_list = flatten_modules(module_list)
        self.module_dict = AttrDict()
        # Expose every module on the agent before registering any of them.
        for module in module_list:
            assert module.module_name
            setattr(self, module.module_name, module)
            self.module_dict[module.module_name] = module
        for module in module_list:
            self._register_module(module)

        self.training = True

        if load_agent:
            self.load()
            print('Successfully loaded saved agent!')
        else:
            self.save()

    def train_mode(self):
        """Set agent to train mode; exploration / use dropout / etc. As in Pytorch."""
        self.training = True

    def eval_mode(self):
        """Set agent to eval mode; act deterministically / don't use dropout / etc."""
        self.training = False

    def process_experience(self, experience: "AttrDict"):
        """Calls the _process_experience function of each relevant module
        (typically, these will include a replay buffer and one or more logging modules)"""
        self.config.env_steps += self.env.num_envs if hasattr(self, 'env') else 1

        for module in self._process_experience_registry:
            module._process_experience(experience)

    def optimize(self):
        """Calls the _optimize function of each relevant module
        (typically, this will be the main algorithm; but may include others)"""
        self.config.opt_steps += 1
        for module in self._optimize_registry:
            module._optimize()

    def _register_module(self, module):
        """
        Provides module with a reference to agent so that modules can interact; e.g.,
        allows agent's policy to reference the value function.

        Then, calls each module's _setup and verify methods to _setup the module and
        verify that agent has all required modules.
        """
        self.module_dict[module.module_name] = module

        module.agent = self
        module.verify_agent_compatibility()
        module._setup()
        module.new_task()
        if hasattr(module, '_process_experience'):
            self._process_experience_registry.append(module)
        if hasattr(module, '_optimize'):
            self._optimize_registry.append(module)

    def set_module(self, module_name, module):
        """
        Sets a module (can be used to switch environments / policies)
        """
        setattr(self, module_name, module)
        self._register_module(module)

    def save(self, subfolder: Optional[str] = None):
        """
        The state of all stateful modules is saved to the agent's folder.
        The agent itself is NOT saved, and should be (1) rebuilt, and (2) restored using self.load().
        Subfolder can be used to save various checkpoints of same agent.
        """
        save_folder = os.path.join(self.agent_folder, subfolder or 'checkpoint')
        os.makedirs(save_folder, exist_ok=True)

        for module in self.module_dict.values():
            module.save(save_folder)

        with open(os.path.join(save_folder, 'config.pickle'), 'wb') as f:
            pickle.dump(self.config, f)

    def load(self, subfolder: Optional[str] = None):
        """
        Restores state of stateful modules from the agent's folder[/subfolder].
        """
        save_folder = os.path.join(self.agent_folder, subfolder or 'checkpoint')
        assert os.path.exists(save_folder), "load path does not exist!"

        # NOTE: pickle.load executes arbitrary code from the file; only load
        # checkpoints from trusted locations.
        with open(os.path.join(save_folder, 'config.pickle'), 'rb') as f:
            self.config = pickle.load(f)

        for module in self.module_dict.values():
            print("Loading module {}".format(module.module_name))
            module.load(save_folder)

    def save_checkpoint(self, checkpoint_dir):
        """
        Saves agent together with its buffer regardless of save buffer.
        Keeps 2 saves in the in folder in case the job is killed and last
        checkpoint is corrupted.

        NOTE: You should call agent.save to save to the main folder BEFORE calling this.
        """
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir, exist_ok=True)
            with open(os.path.join(checkpoint_dir, 'INITIALIZED'), 'w') as f:
                f.write('INITIALIZED')

        subfolder1 = os.path.join(checkpoint_dir, '1')
        subfolder2 = os.path.join(checkpoint_dir, '2')

        os.makedirs(os.path.join(subfolder1, 'checkpoint'), exist_ok=True)
        os.makedirs(os.path.join(subfolder2, 'checkpoint'), exist_ok=True)

        done1 = os.path.join(subfolder1, 'DONE')
        done2 = os.path.join(subfolder2, 'DONE')

        # Write into a slot without a DONE marker if there is one; otherwise
        # overwrite the older slot, removing its marker first so an
        # interrupted save is never mistaken for a complete one.
        if not os.path.exists(done1):
            savedir = subfolder1
            done_file = done1
        elif not os.path.exists(done2):
            savedir = subfolder2
            done_file = done2
        else:
            modtime1 = os.path.getmtime(done1)
            modtime2 = os.path.getmtime(done2)
            if modtime1 < modtime2:
                savedir = subfolder1
                done_file = done1
            else:
                savedir = subfolder2
                done_file = done2

            os.remove(done_file)

        savedir_checkpoint = os.path.join(savedir, 'checkpoint')
        # First save all modules, including replay buffer. The flag is
        # restored in `finally` so a failing module.save() cannot leave the
        # config permanently forcing replay-buffer saves.
        old_save_replay_buf = self.config.save_replay_buf
        self.config.save_replay_buf = True
        try:
            for module in self.module_dict.values():
                module.save(savedir_checkpoint)
        finally:
            self.config.save_replay_buf = old_save_replay_buf

        # Now save the config also
        with open(os.path.join(savedir_checkpoint, 'config.pickle'), 'wb') as f:
            pickle.dump(self.config, f)

        # Now copy over the config and results files from the agent_folder
        files_and_folders = glob.glob(os.path.join(self.agent_folder, '*'))
        for file_or_folder in files_and_folders:
            if os.path.isfile(file_or_folder):
                shutil.copy(file_or_folder, savedir)

        # Finally, print the DONE file.
        with open(done_file, 'w') as f:
            f.write('DONE')

    def load_from_checkpoint(self, checkpoint_dir):
        """
        This loads an agent from a checkpoint_dir to which it was saved using the `save_checkpoint` method.
        """
        subfolder1 = os.path.join(checkpoint_dir, '1')
        subfolder2 = os.path.join(checkpoint_dir, '2')
        done1 = os.path.join(subfolder1, 'DONE')
        done2 = os.path.join(subfolder2, 'DONE')

        # Pick the slot with a DONE marker; if both have one, the newer.
        if not os.path.exists(done1):
            assert os.path.exists(done2)
            savedir = subfolder2
        elif not os.path.exists(done2):
            savedir = subfolder1
        else:
            modtime1 = os.path.getmtime(done1)
            modtime2 = os.path.getmtime(done2)
            if modtime1 > modtime2:
                savedir = subfolder1
            else:
                savedir = subfolder2

        savedir_checkpoint = os.path.join(savedir, 'checkpoint')

        # First load the agent
        # NOTE: pickle.load executes arbitrary code from the file; only load
        # checkpoints from trusted locations.
        with open(os.path.join(savedir_checkpoint, 'config.pickle'), 'rb') as f:
            self.config = pickle.load(f)

        for module in self.module_dict.values():
            print("Loading module {}".format(module.module_name))
            module.load(savedir_checkpoint)

        # Then copy over the config and results file to the agent_folder
        files_and_folders = glob.glob(os.path.join(savedir, '*'))
        for file_or_folder in files_and_folders:
            if os.path.isfile(file_or_folder):
                shutil.copy(file_or_folder, self.agent_folder)

    def torch(self, x):
        """Return ``x`` unchanged if it is a tensor, else a FloatTensor on ``config.device``."""
        if isinstance(x, torch.Tensor):
            return x
        return torch.FloatTensor(x).to(self.config.device)

    def numpy(self, x):
        """Detach tensor ``x``, move it to the CPU and return it as a numpy array."""
        return x.cpu().detach().numpy()
class OffPolicyActorCritic(mrl.Module):
    """This is the standard DDPG"""

    def __init__(self):
        super().__init__(
            'algorithm',
            required_agent_modules=['actor', 'critic', 'replay_buffer', 'env'],
            locals=locals())

    def _setup(self):
        """Sets up actor/critic optimizers and creates target network modules"""
        self.targets_and_models = []

        def _collect(prefix, registry):
            """Fill ``registry`` with matching modules; return their parameters."""
            params = []
            # Iterate over a snapshot: set_module() is called inside the loop.
            for mod in list(self.module_dict.values()):
                mod_name = mod.module_name
                if not (mod_name.startswith(prefix) and isinstance(mod, PytorchModel)):
                    continue
                registry.append(mod)
                params += list(mod.model.parameters())

                target_name = mod_name + '_target'
                tgt = mod.copy(target_name)
                tgt.model.load_state_dict(mod.model.state_dict())
                # Freeze target networks with respect to optimizers
                # (only update via polyak averaging)
                for weight in tgt.model.parameters():
                    weight.requires_grad = False
                self.agent.set_module(target_name, tgt)
                self.targets_and_models.append((tgt.model, mod.model))
            return params

        # Actor setup
        self.actors = []
        actor_params = _collect('actor', self.actors)
        if actor_params:
            self.actor_opt = torch.optim.Adam(
                actor_params,
                lr=self.config.actor_lr,
                weight_decay=self.config.actor_weight_decay)
        else:
            # No actor parameters: stand-in object exposing only state_dict().
            self.actor_opt = AttrDict({'state_dict': lambda: []})
        self.actor_params = actor_params

        # Critic setup
        self.critics = []
        critic_params = _collect('critic', self.critics)
        self.critic_opt = torch.optim.Adam(
            critic_params,
            lr=self.config.critic_lr,
            weight_decay=self.config.critic_weight_decay)
        self.critic_params = critic_params

        self.action_scale = self.env.max_action

    def save(self, save_folder : str):
        """Write both optimizer state dicts to ``<save_folder>/<module_name>.pt``."""
        optimizer_states = {
            'actor_opt_state_dict': self.actor_opt.state_dict(),
            'critic_opt_state_dict': self.critic_opt.state_dict()
        }
        torch.save(optimizer_states,
                   os.path.join(save_folder, self.module_name + '.pt'))

    def load(self, save_folder : str):
        """Restore the critic optimizer state from ``<save_folder>/<module_name>.pt``.

        The actor optimizer state is present in the file but is not restored:
        the corresponding call is disabled below.
        """
        checkpoint = torch.load(os.path.join(save_folder, self.module_name + '.pt'))
        #self.actor_opt.load_state_dict(checkpoint['actor_opt_state_dict'])
        self.critic_opt.load_state_dict(checkpoint['critic_opt_state_dict'])

    def _optimize(self):
        """Run one optimization step once the replay buffer exceeds ``warm_up``.

        Every ``target_network_update_freq`` optimization steps, each target
        model is soft-updated towards its source model.
        """
        if not len(self.replay_buffer) > self.config.warm_up:
            return

        states, actions, rewards, next_states, gammas = self.replay_buffer.sample(
            self.config.batch_size)
        self.optimize_from_batch(states, actions, rewards, next_states, gammas)

        if self.config.opt_steps % self.config.target_network_update_freq == 0:
            update_frac = self.config.target_network_update_frac
            for target_model, model in self.targets_and_models:
                soft_update(target_model, model, update_frac)

    def optimize_from_batch(self, states, actions, rewards, next_states, gammas):
        """Algorithm-specific update on one sampled batch; subclasses implement it."""
        raise NotImplementedError('Subclass this!')
def __init__(self, limit, item_shape, n_cpu=1):
    """
    The replay buffer object. Stores everything in float32.

    All item buffers are RingBuffers backed by shared RawArrays, collected in
    the module-level ``BUFF`` object (also reachable as ``self.BUFF``). With
    ``n_cpu > 1`` a worker pool is created and initialized with ``BUFF``.

    :param limit: (int) the max number of transitions to store
    :param item_shape: a list of tuples of (str) item name and (tuple) the shape for item
      Ex: [("observations", env.observation_space.shape),\
          ("actions",env.action_space.shape),\
          ("rewards", (1,)),\
          ("dones", (1,))]
    :param n_cpu: (int) number of worker processes; a pool is only created when > 1
    """
    self.limit = limit

    global BUFF
    BUFF = AttrDict()
    # a global object that has shared RawArray-based RingBuffers.
    self.BUFF = BUFF

    # item buffers
    BUFF.items = []
    for name, shape in item_shape:
        full_shape = (limit, ) + shape
        raw_key, np_key, buf_key = 'raw_' + name, 'np_' + name, 'buffer_' + name

        BUFF.items.append(buf_key)
        BUFF[raw_key] = RawArray('f', int(np.prod(full_shape)))
        BUFF[np_key] = np.frombuffer(BUFF[raw_key], dtype=np.float32).reshape(full_shape)
        BUFF[buf_key] = RingBuffer(limit, shape=shape, data=BUFF[np_key])

    # special buffers
    # The 'd' typecode is used as raw storage and viewed as int64 below.
    BUFF.raw_tidx = RawArray('d', limit)
    BUFF.np_tidx = np.frombuffer(BUFF.raw_tidx, dtype=np.int64)
    BUFF.buffer_tidx = RingBuffer(limit, shape=(), dtype=np.int64, data=BUFF.np_tidx)

    BUFF.raw_tleft = RawArray('d', limit)
    BUFF.np_tleft = np.frombuffer(BUFF.raw_tleft, dtype=np.int64)
    BUFF.buffer_tleft = RingBuffer(limit, shape=(), dtype=np.int64, data=BUFF.np_tleft)

    if 'buffer_bg' in BUFF:
        # is this a successful trajectory?
        BUFF.raw_success = RawArray('f', limit)
        BUFF.np_success = np.frombuffer(BUFF.raw_success, dtype=np.float32)
        BUFF.buffer_success = RingBuffer(
            limit, shape=(), dtype=np.float32, data=BUFF.np_success)

    # a centralized dict of trajectory_id --> trajectory_idxs
    self.trajectories = OrderedDict()
    self.total_trajectory_len = 0
    self.current_trajectory = 0

    self.n_cpu = n_cpu
    self.pool = None
    if n_cpu > 1:
        self.pool = mp.Pool(n_cpu, initializer=worker_init, initargs=(BUFF,))
class EntropyPrioritizedOnlineHERBuffer(mrl.Module):
    """Side buffer holding a per-transition entropy score and rank-based
    priority, used to sample indices from the main replay buffer."""

    def __init__(
            self,
            module_name='prioritized_replay',
            rank_method='dense',
            temperature=1.0
    ):
        """
        Buffer that stores entropy of trajectories for prioritized replay

        :param module_name: name under which the module is registered
        :param rank_method: ``method`` passed to ``rankdata``; with ``'none'``
            sampling uses the raw entropy instead of the rank
        :param temperature: sampling temperature used in ``__call__``
        """
        super().__init__(module_name, required_agent_modules=['env', 'replay_buffer'], locals=locals())

        self.goal_space = None
        self.buffer = None
        self.rank_method = rank_method
        self.temperature = temperature
        self.traj_len = None

    def _setup(self):
        """Bind to the replay buffer's achieved-goal buffer and allocate storage."""
        self.ag_buffer = self.replay_buffer.buffer.BUFF.buffer_ag

        env = self.env
        # isinstance (rather than an exact type comparison) also accepts
        # subclasses of gym.spaces.Dict.
        assert isinstance(env.observation_space, gym.spaces.Dict)
        self.goal_space = env.observation_space.spaces["desired_goal"]

        # Note: for now we apply entropy estimation on the achieved goal (ag) space
        # Define the buffers to store for prioritization
        items = [("entropy", (1,)), ("priority", (1,))]
        self.buffer = AttrDict()
        for name, shape in items:
            self.buffer['buffer_' + name] = RingBuffer(self.ag_buffer.maxlen, shape=shape)

        self._subbuffers = [[] for _ in range(self.env.num_envs)]
        self.n_envs = self.env.num_envs

        # Placeholder for the mixture model used to estimate trajectory
        # density; an int here means "not fitted yet".
        self.clf = 0

    def fit_density_model(self):
        """Fit the mixture model on all stored trajectories and refresh entropies.

        The negative log-likelihood of each trajectory is shifted to start at
        zero and normalized by its sum; the shift and sum are kept in
        ``pred_min`` / ``pred_sum`` for scoring later trajectories.
        """
        ag = self.ag_buffer.data[0:self.size].copy()
        X_train = ag.reshape(-1, self.traj_len * ag.shape[-1])  # [num_episodes, episode_len * goal_dim]

        self.clf = mixture.BayesianGaussianMixture(weight_concentration_prior_type="dirichlet_distribution", n_components=3)
        self.clf.fit(X_train)
        pred = -self.clf.score_samples(X_train)

        self.pred_min = pred.min()
        pred = pred - self.pred_min
        pred = np.clip(pred, 0, None)
        self.pred_sum = pred.sum()
        pred = pred / self.pred_sum
        self.pred_avg = (1 / pred.shape[0])

        # One score per trajectory -> one score per transition.
        pred = np.repeat(pred, self.traj_len, axis=0)

        self.buffer.buffer_entropy.data[:self.size] = pred.reshape(-1, 1).copy()

    def _process_experience(self, exp):
        """Accumulate achieved goals per env; on trajectory end, store its
        entropy and a zero priority, then re-rank all stored transitions."""
        # Compute the entropy
        # TODO: Include previous achieved goal too? or use that instead of ag?
        achieved = exp.next_state['achieved_goal']
        for i in range(self.n_envs):
            self._subbuffers[i].append([achieved[i]])

        for i in range(self.n_envs):
            if exp.trajectory_over[i]:
                # TODO: Compute the entropy of the trajectory
                traj_len = len(self._subbuffers[i])
                if self.traj_len is None:
                    self.traj_len = traj_len
                else:
                    # Current implementation assumes the same length for all trajectories
                    assert(traj_len == self.traj_len)

                if not isinstance(self.clf, int):
                    ag = [np.stack(a) for a in zip(*self._subbuffers[i])][0]  # [episode_len, goal_dim]
                    X = ag.reshape(-1, ag.shape[0] * ag.shape[1])
                    pred = -self.clf.score_samples(X)

                    pred = pred - self.pred_min
                    pred = np.clip(pred, 0, None)
                    pred = pred / self.pred_sum  # Shape (1,)

                    entropy = np.ones((traj_len, 1)) * pred
                else:
                    # Not enough data to train mixture density yet, set entropy to be zero
                    entropy = np.zeros((traj_len, 1))

                priority = np.zeros((traj_len, 1))
                trajectory = [entropy, priority]

                # TODO: Update the trajectory with entropy
                self.add_trajectory(*trajectory)

                self._subbuffers[i] = []

                # TODO: Update the rank here before adding it to the trajectory?
                self.update_priority()

    def add_trajectory(self, *items):
        """
        Append a trajectory of transitions to the buffer.

        :param items: a list of batched transition values to append to the replay buffer,
            in the item order that we initialized the ReplayBuffer with.
        """
        for buffer, batched_values in zip(self.buffer.values(), items):
            buffer.append_batch(batched_values)

    def update_priority(self):
        """
        After adding a trajectory to the replay buffer, update the ranking of transitions
        """
        # Note: 'dense' assigns the next highest element with the rank immediately
        # after those assigned to the tied elements.
        entropy_transition_total = self.buffer.buffer_entropy.data[:self.size]
        entropy_rank = rankdata(entropy_transition_total, method=self.rank_method)
        # Shift ranks to start at 0.
        entropy_rank = (entropy_rank - 1).reshape(-1, 1)
        self.buffer.buffer_priority.data[:self.size] = entropy_rank

    def __call__(self, batch_size):
        """
        Samples batch_size number of indices from main replay_buffer.

        Args:
          batch_size (int): size of the batch to sample

        Returns:
          batch_idxs: a 1-D numpy array of length batch_size containing indices
                      sampled in prioritized manner
        """
        if self.rank_method == 'none':
            entropy_trajectory = self.buffer.buffer_entropy.data[:self.size]
        else:
            entropy_trajectory = self.buffer.buffer_priority.data[:self.size]

        # Factorize out sampling into sampling trajectory according to priority/entropy
        # then sample time uniformly independently
        entropy_trajectory = entropy_trajectory.reshape(-1, self.traj_len)[:, 0]
        p_trajectory = np.power(entropy_trajectory, 1 / (self.temperature + 1e-2))
        num_trajectories = p_trajectory.shape[0]

        total = p_trajectory.sum()
        if num_trajectories and total == 0:
            # All weights are zero (e.g. every entropy/rank is still 0):
            # dividing would give NaN probabilities and np.random.choice
            # would raise, so sample trajectories uniformly instead.
            p_trajectory = np.full(num_trajectories, 1.0 / num_trajectories)
        else:
            p_trajectory = p_trajectory / total

        batch_tidx = np.random.choice(num_trajectories, size=batch_size, p=p_trajectory)
        batch_idxs = self.traj_len * batch_tidx + np.random.choice(self.traj_len, size=batch_size)

        return batch_idxs

    @property
    def size(self):
        """Number of transitions currently in the achieved-goal buffer."""
        return len(self.ag_buffer)

    def save(self, save_folder):
        """Pickle the buffer state when ``config.save_replay_buf`` is set."""
        if self.config.save_replay_buf:
            # NOTE(review): self.buffer is the AttrDict created in _setup;
            # confirm it actually provides _get_state/_set_state.
            state = self.buffer._get_state()
            with open(os.path.join(save_folder, "{}.pickle".format(self.module_name)), 'wb') as f:
                pickle.dump(state, f)

    def load(self, save_folder):
        """Restore the buffer state if a pickle exists; otherwise log loud warnings."""
        load_path = os.path.join(save_folder, "{}.pickle".format(self.module_name))
        if os.path.exists(load_path):
            # NOTE: pickle.load executes arbitrary code from the file; only
            # load checkpoints from trusted locations.
            with open(load_path, 'rb') as f:
                state = pickle.load(f)
            self.buffer._set_state(state)
        else:
            self.logger.log_color('###############################################################', '', color='red')
            self.logger.log_color('WARNING', 'Replay buffer is not being loaded / was not saved.', color='cyan')
            self.logger.log_color('WARNING', 'Replay buffer is not being loaded / was not saved.', color='red')
            self.logger.log_color('WARNING', 'Replay buffer is not being loaded / was not saved.', color='yellow')
            self.logger.log_color('###############################################################', '', color='red')
def _process_experience(self, experience):
    """Curiosity module updates the desired goal depending on experience.trajectory_over

    Steps, in order:
      1. Initialize ``self.current_goals`` from the reset state on first use.
      2. Flag envs whose computed reward for the current goal is close to 0,
         and let ``_manage_resets_and_success_behaviors`` decide on resets
         and overshooting.
      3. If any trajectory ended and achieved goals are available, sample
         candidate goals from the achieved-goal buffer, score them (lower is
         better) and, per env, either adopt the chosen candidate or keep the
         current goal.
      4. For every env whose trajectory ended, record the success flag,
         install the new goal and clear the per-episode flags.
    """
    ag_buffer = self.replay_buffer.buffer.BUFF.buffer_ag

    if self.current_goals is None:
        self.current_goals = experience.reset_state['desired_goal']

    computed_reward = self.env.compute_reward(
        experience.next_state['achieved_goal'], self.current_goals, None)
    close = np.isclose(computed_reward, 0.)

    # First, manage the episode resets & any special behavior that occurs on goal achievement, like go explore / resets / overshooting
    reset_idxs, overshooting_idxs, overshooting_proposals = self._manage_resets_and_success_behaviors(
        experience, close)

    if reset_idxs:
        self.train.reset_next(reset_idxs)

    if overshooting_idxs and len(ag_buffer):
        self._overshoot_goals(experience, overshooting_idxs,
                              overshooting_proposals)

    # Now consider replacing the current goals with something else:
    if np.any(experience.trajectory_over) and len(ag_buffer):
        # sample some achieved goals
        sample_idxs = np.random.randint(len(ag_buffer),
                                        size=self.num_sampled_ags * self.n_envs)
        sampled_ags = ag_buffer.get_batch(sample_idxs)
        sampled_ags = sampled_ags.reshape(self.n_envs, self.num_sampled_ags, -1)

        # compute the q-values of both the sampled achieved goals and the current goals
        states = np.tile(experience.reset_state['observation'][:, None, :],
                         (1, self.num_sampled_ags, 1))
        states = np.concatenate(
            (states, sampled_ags),
            -1).reshape(self.num_sampled_ags * self.n_envs, -1)
        states_curr = np.concatenate(
            (experience.reset_state['observation'], self.current_goals), -1)
        # Candidate states first, current-goal states last; np.split below
        # relies on this ordering.
        states_cat = np.concatenate((states, states_curr), 0)

        bad_q_idxs, q_values = [], None
        if self.use_qcutoff:
            q_values = self.compute_q(states_cat)
            q_values, curr_q = np.split(
                q_values, [self.num_sampled_ags * self.n_envs])
            q_values = q_values.reshape(self.n_envs, self.num_sampled_ags)

            # Set cutoff dynamically by using intrinsic_success_percent
            # (only re-evaluated once the deque holds exactly 10 results).
            if len(self.successes_deque) == 10:
                self.min_cutoff = max(
                    self.min_min_cutoff,
                    min(np.min(q_values), self.min_cutoff))
                intrinsic_success_percent = np.mean(self.successes_deque)
                if intrinsic_success_percent >= self.config.cutoff_success_threshold[
                        1]:
                    self.cutoff = max(self.min_cutoff, self.cutoff - 1.)
                    self.successes_deque.clear()
                elif intrinsic_success_percent <= self.config.cutoff_success_threshold[
                        0]:
                    self.cutoff = max(
                        min(self.config.initial_cutoff, self.cutoff + 1.),
                        self.min_min_cutoff)
                    self.successes_deque.clear()

            # zero out the "bad" values. This practically eliminates them as candidates if any goals are viable.
            bad_q_idxs = q_values < self.cutoff
            # The sign of the bad entries is flipped, the row minimum taken,
            # then the sign flipped back. NOTE(review): min_q_values is not
            # read again in this function.
            q_values[bad_q_idxs] *= -1
            min_q_values = np.min(q_values, axis=1, keepdims=True)  # num_envs x1
            q_values[bad_q_idxs] *= -1

        # score the goals -- lower is better
        goal_values = self.score_goals(
            sampled_ags, AttrDict(q_values=q_values, states=states))

        if self.config.dg_score_multiplier > 1. and self.dg_kde.ready:
            dg_scores = self.dg_kde.evaluate_log_density(
                sampled_ags.reshape(self.n_envs * self.num_sampled_ags, -1))
            dg_scores = dg_scores.reshape(self.n_envs, self.num_sampled_ags)
            goal_values[
                dg_scores > -np.inf] *= self.config.dg_score_multiplier

        if q_values is not None:
            # Replace the scores of below-cutoff candidates by a tiny multiple
            # of their (sign-flipped) q-value.
            goal_values[bad_q_idxs] = q_values[bad_q_idxs] * -1e-8

        if self.randomize:  # sample proportional to the absolute score
            abs_goal_values = np.abs(goal_values)
            normalized_values = abs_goal_values / np.sum(
                abs_goal_values, axis=1, keepdims=True)
            # Inverse-CDF sampling: first column whose cumulative weight
            # exceeds a uniform draw, one draw per row.
            chosen_idx = (normalized_values.cumsum(1) > np.random.rand(
                normalized_values.shape[0])[:, None]).argmax(1)
        else:  # take minimum
            chosen_idx = np.argmin(goal_values, axis=1)

        # One-hot selection mask over the sampled candidates.
        chosen_idx = np.eye(self.num_sampled_ags)[
            chosen_idx]  # shape(sampled_ags) = n_envs x num_sampled_ags
        if q_values is not None:
            chosen_q_val = (chosen_idx * q_values).sum(axis=1, keepdims=True)
        chosen_ags = np.sum(sampled_ags * chosen_idx[:, :, None],
                            axis=1)  # n_envs x goal_feats

        # replace goal always when first_visit_succ (relying on the dg_score_multiplier to dg focus), otherwise
        # we are going to transition into the dgs using the ag_kde_tophat
        if hasattr(self, 'curiosity_alpha'):
            if self.use_qcutoff:
                # Replace when the random draw exceeds alpha OR the current
                # goal's q-value is below the cutoff.
                replace_goal = np.logical_or(
                    (np.random.random(
                        (self.n_envs, 1)) > self.curiosity_alpha.alpha),
                    curr_q < self.cutoff).astype(np.float32)
            else:
                replace_goal = (np.random.random(
                    (self.n_envs, 1)) > self.curiosity_alpha.alpha).astype(
                        np.float32)
        else:
            replace_goal = np.ones((self.n_envs, 1), dtype=np.float32)

        # sometimes keep the desired goal anyways
        replace_goal *= (np.random.uniform(size=[self.n_envs, 1]) >
                         self.keep_dg_percent).astype(np.float32)

        # replace_goal is a 0/1 mask, so this picks per env between the
        # chosen candidate and the current goal.
        new_goals = replace_goal * chosen_ags + (
            1 - replace_goal) * self.current_goals

        if hasattr(self, 'logger') and len(self.successes) > 50:
            if q_values is not None:
                self.logger.add_histogram(
                    'Explore/Goal_q', replace_goal * chosen_q_val +
                    (1 - replace_goal) * curr_q)
            self.logger.add_scalar('Explore/Intrinsic_success_percent',
                                   np.mean(self.successes))
            self.logger.add_scalar('Explore/Cutoff', self.cutoff)
            self.successes = []

        replace_goal = replace_goal.reshape(-1)

        for i in range(self.n_envs):
            if experience.trajectory_over[i]:
                self.successes.append(float(self.is_success[i, 0] >= 1.)
                                      )  # compromise due to exploration
                self.successes_deque.append(
                    float(self.is_success[i, 0] >= 1.)
                )  # compromise due to exploration
                self.current_goals[i] = new_goals[i]
                if replace_goal[i]:
                    self.replaced_goal[i] = 1.
                # Clear the per-episode flags for the next trajectory.
                self.go_explore[i] = 0.
                self.is_success[i] = 0.