def rollout_path(env, task_params, obs_task_params, post_cond_policy):
    cur_eval_path_builder = PathBuilder()
    # reset the env using the params
    observation = env.reset(task_params=task_params, obs_task_params=obs_task_params)
    terminal = False
    task_identifier = env.task_identifier
    while (not terminal) and len(cur_eval_path_builder) < MAX_PATH_LENGTH:
        agent_obs = observation['obs']
        action, agent_info = post_cond_policy.get_action(agent_obs)
        next_ob, raw_reward, terminal, env_info = env.step(action)
        # force non-terminal so the rollout runs until MAX_PATH_LENGTH
        terminal = False
        reward = raw_reward
        terminal = np.array([terminal])
        reward = np.array([reward])
        cur_eval_path_builder.add_all(
            observations=observation,
            actions=action,
            rewards=reward,
            next_observations=next_ob,
            terminals=terminal,
            agent_infos=agent_info,
            env_infos=env_info,
            task_identifiers=task_identifier,
        )
        observation = next_ob
    return cur_eval_path_builder.get_all_stacked()
def load_paths(self):
    paths = []
    for i in range(len(self.data)):
        p = self.data[i]
        H = len(p["observations"]) - 1
        path_builder = PathBuilder()
        for t in range(H):
            ob = p["observations"][t, :]
            action = p["actions"][t, :]
            reward = p["rewards"][t]
            next_ob = p["observations"][t + 1, :]
            terminal = 0
            agent_info = {}  # todo (need to unwrap each key)
            env_info = {}  # todo (need to unwrap each key)
            path_builder.add_all(
                observations=ob,
                actions=action,
                rewards=reward,
                next_observations=next_ob,
                terminals=terminal,
                agent_infos=agent_info,
                env_infos=env_info,
            )
        path = path_builder.get_all_stacked()
        paths.append(path)
    return paths
def rollout(
    env,
    policy,
    max_path_length,
    no_terminal=False,
    render=False,
    render_kwargs={},
):
    path_builder = PathBuilder()
    observation = env.reset()
    for _ in range(max_path_length):
        action, agent_info = policy.get_action(observation)
        if render:
            env.render(**render_kwargs)
        next_ob, reward, terminal, env_info = env.step(action)
        if no_terminal:
            terminal = False
        path_builder.add_all(
            observations=observation,
            actions=action,
            rewards=np.array([reward]),
            next_observations=next_ob,
            terminals=np.array([terminal]),
            absorbing=np.array([0., 0.]),
            agent_info=agent_info,
            env_info=env_info,
        )
        observation = next_ob
        if terminal:
            break
    return path_builder
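# Illustrative usage of rollout() above (not from the original code). DummyEnv and
# RandomPolicy are hypothetical stand-ins that only match the interfaces rollout()
# expects: env.reset() / env.step(action) returning a gym-style 4-tuple, and
# policy.get_action(obs) returning (action, agent_info). It assumes numpy, rollout,
# and a PathBuilder (a minimal sketch appears after the next test) are in scope.
import numpy as np


class DummyEnv:
    def reset(self):
        return np.zeros(3)

    def step(self, action):
        # random next observation, zero reward, never terminal
        return np.random.randn(3), 0.0, False, {}


class RandomPolicy:
    def get_action(self, obs):
        return np.random.randn(2), {}


path_builder = rollout(DummyEnv(), RandomPolicy(), max_path_length=5)
stacked = path_builder.get_all_stacked()
print(stacked['rewards'].shape, stacked['terminals'].shape)  # expected: (5, 1) (5, 1)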
def test_path_length(self):
    path = PathBuilder()
    for _ in range(10):
        path.add_all(
            action=np.array([1, 2, 3]),
            obs=-np.array([1, 2, 3]),
        )
    self.assertEqual(len(path), 10)
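# The snippets in this file all rely on a PathBuilder that is not defined here. Below
# is a minimal sketch consistent with how it is used (add_all() records one transition,
# len() counts transitions, get_all_stacked() returns a dict of stacked arrays) and with
# the two tests in this file. The actual rlkit/railrl implementation is richer (e.g. it
# also handles dict-valued entries), so treat this only as an illustration.
import numpy as np


class PathBuilder:
    """Collects per-step values keyed by name and stacks them at the end."""

    def __init__(self):
        self._data = {}
        self._path_length = 0

    def add_all(self, **key_to_value):
        # Append each named value for this time step.
        for key, value in key_to_value.items():
            self._data.setdefault(key, []).append(value)
        self._path_length += 1

    def __len__(self):
        return self._path_length

    def get_all_stacked(self):
        # Stack each key's per-step list into a numpy array, keyed by name.
        return {key: np.array(values) for key, values in self._data.items()}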
def load_path(self, path, replay_buffer, obs_dict=None):
    rewards = []
    path_builder = PathBuilder()
    print("loading path, length", len(path["observations"]), len(path["actions"]))
    H = min(len(path["observations"]), len(path["actions"]))
    print("actions", np.min(path["actions"]), np.max(path["actions"]))
    for i in range(H):
        if obs_dict:
            ob = path["observations"][i][self.obs_key]
            next_ob = path["next_observations"][i][self.obs_key]
        else:
            ob = path["observations"][i]
            next_ob = path["next_observations"][i]
        if i == 0:
            current_obs = np.zeros((self.stack_obs + 1, len(ob)))
            current_obs[-2, :] = ob
            current_obs[-1, :] = next_ob
        else:
            current_obs = np.vstack((current_obs[1:, :], next_ob))
            assert (current_obs[-2, :] == ob).all(), "mismatch between obs and next_obs"
        obs1 = current_obs[:self.stack_obs, :].flatten()
        obs2 = current_obs[1:, :].flatten()
        action = path["actions"][i]
        reward = path["rewards"][i]
        terminal = path["terminals"][i]
        if not self.load_terminals:
            terminal = np.zeros(terminal.shape)
        agent_info = path["agent_infos"][i]
        env_info = path["env_infos"][i]
        if self.recompute_reward:
            reward = self.env.compute_reward(
                action,
                next_ob,
            )
        reward = np.array([reward])
        rewards.append(reward)
        terminal = np.array([terminal]).reshape((1,))
        path_builder.add_all(
            observations=obs1,
            actions=action,
            rewards=reward,
            next_observations=obs2,
            terminals=terminal,
            agent_infos=agent_info,
            env_infos=env_info,
        )
    self.demo_trajectory_rewards.append(rewards)
    path = path_builder.get_all_stacked()
    replay_buffer.add_path(path)
    print("path sum rewards", sum(rewards), len(rewards))
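# Quick illustration (not part of the original code) of the sliding-window logic in the
# load_path() above: a (stack_obs + 1, obs_dim) buffer is shifted by one row per step, so
# obs1 holds the oldest stack_obs frames and obs2 the newest stack_obs frames. The toy
# values below are hypothetical stand-ins for path["observations"] / path["next_observations"].
import numpy as np

stack_obs, obs_dim = 3, 2
current_obs = np.zeros((stack_obs + 1, obs_dim))
for t in range(5):
    ob = np.full(obs_dim, t)           # stand-in for path["observations"][t]
    next_ob = np.full(obs_dim, t + 1)  # stand-in for path["next_observations"][t]
    if t == 0:
        current_obs[-2, :] = ob
        current_obs[-1, :] = next_ob
    else:
        current_obs = np.vstack((current_obs[1:, :], next_ob))
        assert (current_obs[-2, :] == ob).all()
    obs1 = current_obs[:stack_obs, :].flatten()  # oldest stack_obs frames (zero-padded early on)
    obs2 = current_obs[1:, :].flatten()          # same window shifted forward by one step
print(obs1, obs2)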
def load_path(self, path, replay_buffer, obs_dict=None):
    rewards = []
    path_builder = PathBuilder()
    H = min(len(path["observations"]), len(path["actions"]))
    if obs_dict:
        traj_obs = self.preprocess(path["observations"])
        next_traj_obs = self.preprocess(path["next_observations"])
    else:
        traj_obs = self.env.encode(path["observations"])
        next_traj_obs = self.env.encode(path["next_observations"])
    for i in range(H):
        ob = traj_obs[i]
        next_ob = next_traj_obs[i]
        action = path["actions"][i]
        # #temp fix#
        # ob['state_desired_goal'] = np.zeros_like(ob['state_desired_goal'])
        # ob['latent_desired_goal'] = np.zeros_like(ob['latent_desired_goal'])
        # next_ob['state_desired_goal'] = np.zeros_like(next_ob['state_desired_goal'])
        # next_ob['latent_desired_goal'] = np.zeros_like(next_ob['latent_desired_goal'])
        # action[3] /= 5
        # #temp fix#
        reward = path["rewards"][i]
        terminal = path["terminals"][i]
        if not self.load_terminals:
            terminal = np.zeros(terminal.shape)
        agent_info = path["agent_infos"][i]
        env_info = path["env_infos"][i]
        if self.reward_fn:
            reward = self.reward_fn(ob, action, next_ob, next_ob)
        reward = np.array([reward]).flatten()
        rewards.append(reward)
        terminal = np.array([terminal]).reshape((1,))
        path_builder.add_all(
            observations=ob,
            actions=action,
            rewards=reward,
            next_observations=next_ob,
            terminals=terminal,
            agent_infos=agent_info,
            env_infos=env_info,
        )
    self.demo_trajectory_rewards.append(rewards)
    path = path_builder.get_all_stacked()
    replay_buffer.add_path(path)
    print("rewards", np.min(rewards), np.max(rewards))
    print("loading path, length", len(path["observations"]), len(path["actions"]))
    print("actions", np.min(path["actions"]), np.max(path["actions"]))
    print("path sum rewards", sum(rewards), len(rewards))
def load_path(self, path, replay_buffer, obs_dict=None):
    # Filter data #
    if not self.data_filter_fn(path):
        return
    rewards = []
    path_builder = PathBuilder()
    print("loading path, length", len(path["observations"]), len(path["actions"]))
    H = min(len(path["observations"]), len(path["actions"]))
    print("actions", np.min(path["actions"]), np.max(path["actions"]))
    for i in range(H):
        if obs_dict:
            ob = path["observations"][i][self.obs_key]
            next_ob = path["next_observations"][i][self.obs_key]
        else:
            ob = path["observations"][i]
            next_ob = path["next_observations"][i]
        action = path["actions"][i]
        reward = path["rewards"][i]
        terminal = path["terminals"][i]
        if not self.load_terminals:
            terminal = np.zeros(terminal.shape)
        agent_info = path["agent_infos"][i]
        env_info = path["env_infos"][i]
        if self.recompute_reward:
            reward = self.env.compute_reward(
                action,
                next_ob,
            )
        reward = np.array([reward]).flatten()
        rewards.append(reward)
        terminal = np.array([terminal]).reshape((1,))
        path_builder.add_all(
            observations=ob,
            actions=action,
            rewards=reward,
            next_observations=next_ob,
            terminals=terminal,
            agent_infos=agent_info,
            env_infos=env_info,
        )
    self.demo_trajectory_rewards.append(rewards)
    path = path_builder.get_all_stacked()
    replay_buffer.add_path(path)
    print("path sum rewards", sum(rewards), len(rewards))
def rollout_path(env, task_params, obs_task_params, post_cond_policy, max_path_length, task_idx):
    cur_eval_path_builder = PathBuilder()
    # reset the env using the params
    observation = env.reset(task_params=task_params, obs_task_params=obs_task_params)
    terminal = False
    task_identifier = env.task_identifier
    while (not terminal) and len(cur_eval_path_builder) < max_path_length:
        agent_obs = observation['obs']
        action, agent_info = post_cond_policy.get_action(agent_obs)
        next_ob, raw_reward, terminal, env_info = env.step(action)
        # img = env.render(mode='rgb_array', width=200, height=200)
        if len(cur_eval_path_builder) % 10 == 0:
            # img = env.render(mode='rgb_array')
            env._wrapped_env._get_viewer('rgb_array').render(200, 200, camera_id=0)
            # window size used for old mujoco-py:
            data = env._wrapped_env._get_viewer('rgb_array').read_pixels(200, 200, depth=False)
            # original image is upside-down, so flip it
            img = data[::-1, :, :]
            imsave(
                'plots/walker_irl_frames/walker_task_%02d_step_%03d.png'
                % (task_idx, len(cur_eval_path_builder)),
                img,
            )
        # force non-terminal so the rollout runs until max_path_length
        terminal = False
        # print(env_info['l2_dist'])
        # print('{}: {}'.format(agent_obs[-3:], env_info['l2_dist']))
        # print(agent_obs)
        # print(env_info['l2_dist'])
        reward = raw_reward
        terminal = np.array([terminal])
        reward = np.array([reward])
        cur_eval_path_builder.add_all(
            observations=observation,
            actions=action,
            rewards=reward,
            next_observations=next_ob,
            terminals=terminal,
            agent_infos=agent_info,
            env_infos=env_info,
            task_identifiers=task_identifier,
        )
        observation = next_ob
    return cur_eval_path_builder.get_all_stacked()
def rollout_path(env, task_params, obs_task_params, post_cond_policy, max_path_length):
    cur_eval_path_builder = PathBuilder()
    within_correct = False
    within_incorrect = False
    # reset the env using the params
    observation = env.reset(task_params=task_params, obs_task_params=obs_task_params)
    terminal = False
    task_identifier = env.task_identifier
    while (not terminal) and len(cur_eval_path_builder) < max_path_length:
        agent_obs = observation['obs']
        action, agent_info = post_cond_policy.get_action(agent_obs)
        next_ob, raw_reward, terminal, env_info = env.step(action)
        # force non-terminal so the rollout runs until max_path_length
        terminal = False
        # print(env_info['l2_dist'])
        # print('{}: {}'.format(agent_obs[-3:], env_info['l2_dist']))
        # print(agent_obs)
        # print(env_info['l2_dist'])
        reward = raw_reward
        terminal = np.array([terminal])
        reward = np.array([reward])
        cur_eval_path_builder.add_all(
            observations=observation,
            actions=action,
            rewards=reward,
            next_observations=next_ob,
            terminals=terminal,
            agent_infos=agent_info,
            env_infos=env_info,
            task_identifiers=task_identifier,
        )
        observation = next_ob
        if env_info['within_radius_of_correct']:
            within_correct = True
        if env_info['within_radius_of_incorrect']:
            within_incorrect = True
    return within_correct, within_incorrect
def test_add_and_get_all(self):
    path = PathBuilder()
    path.add_all(
        action=np.array([1, 2, 3]),
        obs=-np.array([1, 2, 3]),
    )
    path.add_all(
        action=np.array([10, 2, 3]),
        obs=-np.array([10, 2, 3]),
    )
    result = path.get_all_stacked()
    self.assertNpArraysEqual(result['action'], np.array([
        [1, 2, 3],
        [10, 2, 3],
    ]))
    self.assertNpArraysEqual(result['obs'], -np.array([
        [1, 2, 3],
        [10, 2, 3],
    ]))
def load_path(self, path, replay_buffer):
    rewards = []
    path_builder = PathBuilder()
    print("loading path, length", len(path["observations"]), len(path["actions"]))
    H = min(len(path["observations"]), len(path["actions"]))
    print("actions", np.min(path["actions"]), np.max(path["actions"]))
    for i in range(H):
        ob = path["observations"][i]
        action = path["actions"][i]
        reward = path["rewards"][i]
        next_ob = path["next_observations"][i]
        terminal = path["terminals"][i]
        agent_info = path["agent_infos"][i]
        env_info = path["env_infos"][i]
        if self.recompute_reward:
            reward = self.env.compute_reward(
                action,
                next_ob,
            )
        reward = np.array([reward])
        rewards.append(reward)
        terminal = np.array([terminal]).reshape((1,))
        path_builder.add_all(
            observations=ob,
            actions=action,
            rewards=reward,
            next_observations=next_ob,
            terminals=terminal,
            agent_infos=agent_info,
            env_infos=env_info,
        )
    self.demo_trajectory_rewards.append(rewards)
    path = path_builder.get_all_stacked()
    replay_buffer.add_path(path)
def load_path(self, path, replay_buffer):
    path_builder = PathBuilder()
    for (
        ob,
        action,
        reward,
        next_ob,
        terminal,
        agent_info,
        env_info,
    ) in zip(
        path["observations"],
        path["actions"],
        path["rewards"],
        path["next_observations"],
        path["terminals"],
        path["agent_infos"],
        path["env_infos"],
    ):
        # goal = path["goal"]["state_desired_goal"][0, :]
        # import pdb; pdb.set_trace()
        # print(goal.shape, ob["state_observation"])
        # state_observation = np.concatenate((ob["state_observation"], goal))
        action = action[:2]
        reward = np.array([reward])
        terminal = np.array([terminal])
        path_builder.add_all(
            observations=ob,
            actions=action,
            rewards=reward,
            next_observations=next_ob,
            terminals=terminal,
            agent_infos=agent_info,
            env_infos=env_info,
        )
    path = path_builder.get_all_stacked()
    replay_buffer.add_path(path)
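# Hedged usage note (not from the original repo): the load_path() variants above are
# typically called once per demonstration trajectory. The file name, pickle format, and
# `replay_buffer` object below are hypothetical stand-ins for whatever the surrounding
# class actually provides.
# import pickle
# with open('demos.pkl', 'rb') as f:   # hypothetical file of saved demo paths
#     demo_paths = pickle.load(f)      # assumed: a list of path dicts with the keys used above
# for demo_path in demo_paths:
#     self.load_path(demo_path, replay_buffer)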
def rollout_path(env, task_params, obs_task_params, post_cond_policy, max_path_length, eval_expert, render):
    cur_eval_path_builder = PathBuilder()
    # reset the env using the params
    observation = env.reset(task_params=task_params, obs_task_params=obs_task_params)
    terminal = False
    task_identifier = env.task_identifier
    this_roll_debug = 0.0
    while (not terminal) and len(cur_eval_path_builder) < max_path_length:
        agent_obs = observation['obs']
        action, agent_info = post_cond_policy.get_action(agent_obs)
        next_ob, raw_reward, terminal, env_info = env.step(action)
        # force non-terminal so the rollout runs until max_path_length
        terminal = False
        reward = raw_reward
        terminal = np.array([terminal])
        reward = np.array([reward])
        cur_eval_path_builder.add_all(
            observations=observation,
            actions=action,
            rewards=reward,
            next_observations=next_ob,
            terminals=terminal,
            agent_infos=agent_info,
            env_infos=env_info,
            task_identifiers=task_identifier,
        )
        observation = next_ob
        this_roll_debug += env_info['reward_forward']
        if render:
            env.render()
    print(this_roll_debug / 100.0)
    return cur_eval_path_builder.get_all_stacked()
class IRLAlgorithm(metaclass=abc.ABCMeta): ''' Generic IRL algorithm class Structure: while True: generate trajectories update reward fit policy ''' def __init__( self, env, exploration_policy: ExplorationPolicy, expert_replay_buffer, training_env=None, num_epochs=100, num_steps_per_epoch=10000, num_steps_per_eval=1000, num_steps_between_updates=1000, min_steps_before_training=1000, max_path_length=1000, discount=0.99, replay_buffer_size=10000, render=False, save_replay_buffer=False, save_algorithm=False, save_environment=False, save_best=False, save_best_starting_from_epoch=0, eval_sampler=None, eval_policy=None, replay_buffer=None, policy_uses_pixels=False, wrap_absorbing=False, freq_saving=1, # some environment like halfcheetah_v2 have a timelimit that defines the terminal # this is used as a minor hack to turn off time limits no_terminal=False, policy_uses_task_params=False, concat_task_params_to_policy_obs=False ): """ Base class for RL Algorithms :param env: Environment used to evaluate. :param exploration_policy: Policy used to explore :param training_env: Environment used by the algorithm. By default, a copy of `env` will be made. :param num_epochs: :param num_steps_per_epoch: :param num_steps_per_eval: :param num_updates_per_env_step: Used by online training mode. :param num_updates_per_epoch: Used by batch training mode. :param batch_size: :param max_path_length: :param discount: :param replay_buffer_size: :param render: :param save_replay_buffer: :param save_algorithm: :param save_environment: :param eval_sampler: :param eval_policy: Policy to evaluate with. :param replay_buffer: """ self.training_env = training_env or pickle.loads(pickle.dumps(env)) # self.training_env = training_env or deepcopy(env) self.exploration_policy = exploration_policy self.expert_replay_buffer = expert_replay_buffer self.num_epochs = num_epochs self.num_env_steps_per_epoch = num_steps_per_epoch self.num_steps_per_eval = num_steps_per_eval self.num_steps_between_updates = num_steps_between_updates self.min_steps_before_training = min_steps_before_training self.max_path_length = max_path_length self.discount = discount self.replay_buffer_size = replay_buffer_size self.render = render self.save_replay_buffer = save_replay_buffer self.save_algorithm = save_algorithm self.save_environment = save_environment self.save_best = save_best self.save_best_starting_from_epoch = save_best_starting_from_epoch self.policy_uses_pixels = policy_uses_pixels self.policy_uses_task_params = policy_uses_task_params self.concat_task_params_to_policy_obs = concat_task_params_to_policy_obs if eval_sampler is None: if eval_policy is None: eval_policy = exploration_policy eval_sampler = InPlacePathSampler( env=env, policy=eval_policy, max_samples=self.num_steps_per_eval + self.max_path_length, max_path_length=self.max_path_length, policy_uses_pixels=policy_uses_pixels, policy_uses_task_params=policy_uses_task_params, concat_task_params_to_policy_obs=concat_task_params_to_policy_obs ) self.eval_policy = eval_policy self.eval_sampler = eval_sampler self.action_space = env.action_space self.obs_space = env.observation_space self.env = env if replay_buffer is None: replay_buffer = EnvReplayBuffer( self.replay_buffer_size, self.env, policy_uses_pixels=self.policy_uses_pixels, policy_uses_task_params=self.policy_uses_task_params, concat_task_params_to_policy_obs=self.concat_task_params_to_policy_obs ) self.replay_buffer = replay_buffer self._n_env_steps_total = 0 self._n_train_steps_total = 0 self._n_rollouts_total = 0 
self._do_train_time = 0 self._epoch_start_time = None self._algo_start_time = None self._old_table_keys = None self._current_path_builder = PathBuilder() self._exploration_paths = [] self.wrap_absorbing = wrap_absorbing self.freq_saving = freq_saving self.no_terminal = no_terminal def train(self, start_epoch=0): self.pretrain() if start_epoch == 0: params = self.get_epoch_snapshot(-1) logger.save_itr_params(-1, params) self.training_mode(False) self._n_env_steps_total = start_epoch * self.num_env_steps_per_epoch gt.reset() gt.set_def_unique(False) self.train_online(start_epoch=start_epoch) def pretrain(self): """ Do anything before the main training phase. """ pass def train_online(self, start_epoch=0): self._current_path_builder = PathBuilder() observation = self._start_new_rollout() for epoch in gt.timed_for( range(start_epoch, self.num_epochs), save_itrs=True, ): self._start_epoch(epoch) steps_this_epoch = 0 while steps_this_epoch < self.num_env_steps_per_epoch: # print(steps_this_epoch) for _ in range(self.num_steps_between_updates): if isinstance(self.obs_space, Dict): if self.policy_uses_pixels: agent_obs = observation['pixels'] else: agent_obs = observation['obs'] else: agent_obs = observation if self.policy_uses_task_params: task_params = observation['obs_task_params'] if self.concat_task_params_to_policy_obs: agent_obs = np.concatenate((agent_obs, task_params), -1) else: agent_obs = {'obs': agent_obs, 'obs_task_params': task_params} action, agent_info = self._get_action_and_info( agent_obs, ) if self.render: self.training_env.render() next_ob, raw_reward, terminal, env_info = ( self.training_env.step(action) ) if self.no_terminal: terminal = False self._n_env_steps_total += 1 reward = raw_reward terminal = np.array([terminal]) reward = np.array([reward]) self._handle_step( observation, action, reward, next_ob, np.array([False]) if self.wrap_absorbing else terminal, absorbing=np.array([0., 0.]), agent_info=agent_info, env_info=env_info, ) if terminal: if self.wrap_absorbing: ''' If we wrap absorbing states, two additional transitions must be added: (s_T, s_abs) and (s_abs, s_abs). In Disc Actor Critic paper they make s_abs be a vector of 0s with last dim set to 1. Here we are going to add the following: ([next_ob,0], random_action, [next_ob, 1]) and ([next_ob,1], random_action, [next_ob, 1]) This way we can handle varying types of terminal states. 
''' # next_ob is the absorbing state # for now just taking the previous action self._handle_step( next_ob, action, # env.action_space.sample(), # the reward doesn't matter reward, next_ob, np.array([False]), absorbing=np.array([0.0, 1.0]), agent_info=agent_info, env_info=env_info ) self._handle_step( next_ob, action, # env.action_space.sample(), # the reward doesn't matter reward, next_ob, np.array([False]), absorbing=np.array([1.0, 1.0]), agent_info=agent_info, env_info=env_info ) self._handle_rollout_ending() observation = self._start_new_rollout() elif len(self._current_path_builder) >= self.max_path_length: self._handle_rollout_ending() observation = self._start_new_rollout() else: observation = next_ob steps_this_epoch += 1 gt.stamp('sample') self._try_to_train(epoch) gt.stamp('train') self._try_to_eval(epoch) gt.stamp('eval') self._end_epoch() def _try_to_train(self, epoch): if self._can_train(): self.training_mode(True) self._do_training(epoch) self._n_train_steps_total += 1 self.training_mode(False) def _try_to_eval(self, epoch): if epoch % self.freq_saving == 0: logger.save_extra_data(self.get_extra_data_to_save(epoch)) if self._can_evaluate(): self.evaluate(epoch) if epoch % self.freq_saving == 0: params = self.get_epoch_snapshot(epoch) logger.save_itr_params(epoch, params) table_keys = logger.get_table_key_set() # if self._old_table_keys is not None: # print('$$$$$$$$$$$$$$$') # print(table_keys) # print('\n'*4) # print(self._old_table_keys) # print('$$$$$$$$$$$$$$$') # print(set(table_keys) - set(self._old_table_keys)) # print(set(self._old_table_keys) - set(table_keys)) # assert table_keys == self._old_table_keys, ( # "Table keys cannot change from iteration to iteration." # ) # self._old_table_keys = table_keys logger.record_tabular( "Number of train steps total", self._n_train_steps_total, ) logger.record_tabular( "Number of env steps total", self._n_env_steps_total, ) logger.record_tabular( "Number of rollouts total", self._n_rollouts_total, ) times_itrs = gt.get_times().stamps.itrs train_time = times_itrs['train'][-1] sample_time = times_itrs['sample'][-1] eval_time = times_itrs['eval'][-1] if epoch > 0 else 0 epoch_time = train_time + sample_time + eval_time total_time = gt.get_times().total logger.record_tabular('Train Time (s)', train_time) logger.record_tabular('(Previous) Eval Time (s)', eval_time) logger.record_tabular('Sample Time (s)', sample_time) logger.record_tabular('Epoch Time (s)', epoch_time) logger.record_tabular('Total Train Time (s)', total_time) logger.record_tabular("Epoch", epoch) logger.dump_tabular(with_prefix=False, with_timestamp=False) else: logger.log("Skipping eval for now.") def _can_evaluate(self): """ One annoying thing about the logger table is that the keys at each iteration need to be the exact same. So unless you can compute everything, skip evaluation. A common example for why you might want to skip evaluation is that at the beginning of training, you may not have enough data for a validation and training set. :return: """ return ( len(self._exploration_paths) > 0 and self.replay_buffer.num_steps_can_sample() >= self.min_steps_before_training ) def _can_train(self): return self.replay_buffer.num_steps_can_sample() >= self.min_steps_before_training def _get_action_and_info(self, observation): """ Get an action to take in the environment. 
:param observation: :return: """ self.exploration_policy.set_num_steps_total(self._n_env_steps_total) return self.exploration_policy.get_action( observation, ) def _start_epoch(self, epoch): self._epoch_start_time = time.time() self._exploration_paths = [] self._do_train_time = 0 logger.push_prefix('Iteration #%d | ' % epoch) def _end_epoch(self): logger.log("Epoch Duration: {0}".format( time.time() - self._epoch_start_time )) logger.log("Started Training: {0}".format(self._can_train())) logger.pop_prefix() def _start_new_rollout(self): self.exploration_policy.reset() return self.training_env.reset() def _handle_path(self, path): raise NotImplementedError('Does not handle absorbing states') """ Naive implementation: just loop through each transition. :param path: :return: """ for ( ob, action, reward, next_ob, terminal, agent_info, env_info ) in zip( path["observations"], path["actions"], path["rewards"], path["next_observations"], path["terminals"], path["agent_infos"], path["env_infos"], ): self._handle_step( ob, action, reward, next_ob, terminal, agent_info=agent_info, env_info=env_info, ) self._handle_rollout_ending() def _handle_step( self, observation, action, reward, next_observation, terminal, absorbing, agent_info, env_info, ): """ Implement anything that needs to happen after every step :return: """ self._current_path_builder.add_all( observations=observation, actions=action, rewards=reward, next_observations=next_observation, terminals=terminal, absorbing=absorbing, agent_infos=agent_info, env_infos=env_info, ) self.replay_buffer.add_sample( observation=observation, action=action, reward=reward, terminal=terminal, next_observation=next_observation, absorbing=absorbing, agent_info=agent_info, env_info=env_info, ) def _handle_rollout_ending(self): """ Implement anything that needs to happen after every rollout. """ self.replay_buffer.terminate_episode() self._n_rollouts_total += 1 if len(self._current_path_builder) > 0: self._exploration_paths.append( self._current_path_builder.get_all_stacked() ) self._current_path_builder = PathBuilder() def get_epoch_snapshot(self, epoch): data_to_save = dict( epoch=epoch, exploration_policy=self.exploration_policy, ) if self.save_environment: data_to_save['env'] = self.training_env return data_to_save def get_extra_data_to_save(self, epoch): """ Save things that shouldn't be saved every snapshot but rather overwritten every time. :param epoch: :return: """ if self.render: self.training_env.render(close=True) data_to_save = dict( epoch=epoch, ) if self.save_environment: data_to_save['env'] = self.training_env if self.save_replay_buffer: data_to_save['replay_buffer'] = self.replay_buffer if self.save_algorithm: data_to_save['algorithm'] = self return data_to_save @abc.abstractmethod def training_mode(self, mode): """ Set training mode to `mode`. :param mode: If True, training will happen (e.g. set the dropout probabilities to not all ones). """ pass @abc.abstractmethod def cuda(self): """ Turn cuda on. :return: """ pass @abc.abstractmethod def evaluate(self, epoch): """ Evaluate the policy, e.g. save/print progress. :param epoch: :return: """ pass @abc.abstractmethod def _do_training(self): """ Perform some update, e.g. perform one gradient step. :return: """ pass
class RLAlgorithm(metaclass=abc.ABCMeta): def __init__( self, env, exploration_policy: ExplorationPolicy, training_env=None, num_epochs=100, num_steps_per_epoch=10000, num_steps_per_eval=1000, num_updates_per_env_step=1, batch_size=1024, max_path_length=1000, discount=0.99, replay_buffer_size=1000000, reward_scale=1, render=False, save_replay_buffer=False, save_algorithm=False, save_environment=False, eval_sampler=None, eval_policy=None, replay_buffer=None, demo_path=None, action_skip=1, experiment_name="default", mix_demo=False, ): """ Base class for RL Algorithms :param env: Environment used to evaluate. :param exploration_policy: Policy used to explore :param training_env: Environment used by the algorithm. By default, a copy of `env` will be made. :param num_epochs: :param num_steps_per_epoch: :param num_steps_per_eval: :param num_updates_per_env_step: Used by online training mode. :param num_updates_per_epoch: Used by batch training mode. :param batch_size: :param max_path_length: :param discount: :param replay_buffer_size: :param reward_scale: :param render: :param save_replay_buffer: :param save_algorithm: :param save_environment: :param eval_sampler: :param eval_policy: Policy to evaluate with. :param replay_buffer: """ ### TODO: look at NormalizedBoxEnv, do we need it? ### # self.training_env = training_env or gym.make("HalfCheetah-v2") self.training_env = training_env or MujocoManipEnv( env.env.__class__.__name__) self.exploration_policy = exploration_policy self.num_epochs = num_epochs self.num_env_steps_per_epoch = num_steps_per_epoch self.num_steps_per_eval = num_steps_per_eval self.num_updates_per_train_call = num_updates_per_env_step self.batch_size = batch_size self.max_path_length = max_path_length self.discount = discount self.replay_buffer_size = replay_buffer_size self.reward_scale = reward_scale self.render = render self.save_replay_buffer = save_replay_buffer self.save_algorithm = save_algorithm self.save_environment = save_environment if eval_sampler is None: if eval_policy is None: eval_policy = exploration_policy eval_sampler = InPlacePathSampler( env=env, policy=eval_policy, max_samples=self.num_steps_per_eval + self.max_path_length, max_path_length=self.max_path_length, ) self.eval_policy = eval_policy self.eval_sampler = eval_sampler self.action_space = env.action_space self.obs_space = env.observation_space self.env = env if replay_buffer is None: replay_buffer = EnvReplayBuffer( self.replay_buffer_size, self.env, ) self.replay_buffer = replay_buffer self.demo_sampler = None self.mix_demo = mix_demo if demo_path is not None: self.demo_sampler = DemoSampler( demo_path=demo_path, observation_dim=self.obs_space.shape[0], action_dim=self.action_space.shape[0], preload=True) self.action_skip = action_skip self.action_skip_count = 0 self._n_env_steps_total = 0 self._n_train_steps_total = 0 self._n_rollouts_total = 0 self._do_train_time = 0 self._epoch_start_time = None self._algo_start_time = None self._old_table_keys = None self._current_path_builder = PathBuilder() self._exploration_paths = [] t_now = time.time() time_str = datetime.datetime.fromtimestamp(t_now).strftime( '%Y%m%d%H%M%S') os.makedirs(os.path.join(LOCAL_EXP_PATH, experiment_name, time_str)) self._writer = SummaryWriter( os.path.join(LOCAL_EXP_PATH, experiment_name, time_str)) def train(self, start_epoch=0): self.pretrain() if start_epoch == 0: params = self.get_epoch_snapshot(-1) #logger.save_itr_params(-1, params) self.training_mode(False) self._n_env_steps_total = start_epoch * 
self.num_env_steps_per_epoch gt.reset() gt.set_def_unique(False) self.train_online(start_epoch=start_epoch) def pretrain(self): """ Do anything before the main training phase. """ pass def train_online(self, start_epoch=0): self._current_path_builder = PathBuilder() observation = self._start_new_rollout() for epoch in gt.timed_for( range(start_epoch, self.num_epochs), save_itrs=True, ): self._start_epoch(epoch) for _ in range(self.num_env_steps_per_epoch): action, agent_info = self._get_action_and_info(observation, ) if self.render: self.training_env.render() next_ob, raw_reward, terminal, env_info = ( self.training_env.step(action)) self._n_env_steps_total += 1 reward = raw_reward * self.reward_scale terminal = np.array([terminal]) reward = np.array([reward]) self._handle_step( observation, action, reward, next_ob, terminal, agent_info=agent_info, env_info=env_info, ) if terminal or len( self._current_path_builder) >= self.max_path_length: self._handle_rollout_ending() observation = self._start_new_rollout() else: observation = next_ob gt.stamp('sample') self._try_to_train() gt.stamp('train') self._try_to_eval(epoch) gt.stamp('eval') self._end_epoch() def _try_to_train(self): if self._can_train(): self.training_mode(True) for i in range(self.num_updates_per_train_call): self._do_training() self._n_train_steps_total += 1 self.training_mode(False) def _try_to_eval(self, epoch): logger.save_extra_data(self.get_extra_data_to_save(epoch)) if self._can_evaluate(): self.evaluate(epoch) params = self.get_epoch_snapshot(epoch) logger.save_itr_params(epoch, params) table_keys = logger.get_table_key_set() #print("TABLE KEYS") #print(table_keys) #if self._old_table_keys is not None: # assert table_keys == self._old_table_keys, ( # "Table keys cannot change from iteration to iteration." 
# ) self._old_table_keys = table_keys logger.record_tabular( "Number of train steps total", self._n_train_steps_total, ) logger.record_tabular( "Number of env steps total", self._n_env_steps_total, ) logger.record_tabular( "Number of rollouts total", self._n_rollouts_total, ) times_itrs = gt.get_times().stamps.itrs train_time = times_itrs['train'][-1] sample_time = times_itrs['sample'][-1] eval_time = times_itrs['eval'][-1] if epoch > 0 else 0 epoch_time = train_time + sample_time + eval_time total_time = gt.get_times().total logger.record_tabular('Train Time (s)', train_time) logger.record_tabular('(Previous) Eval Time (s)', eval_time) logger.record_tabular('Sample Time (s)', sample_time) logger.record_tabular('Epoch Time (s)', epoch_time) logger.record_tabular('Total Train Time (s)', total_time) logger.record_tabular("Epoch", epoch) # tensorboard stuff _writer = self._writer for k, v_str in logger._tabular: if k == 'Epoch': continue v = float(v_str) if k.endswith('Loss'): _writer.add_scalar('Loss/{}'.format(k), v, epoch) elif k.endswith('Max'): prefix = k[:-4] _writer.add_scalar('{}/{}'.format(prefix, k), v, epoch) elif k.endswith('Min'): prefix = k[:-4] _writer.add_scalar('{}/{}'.format(prefix, k), v, epoch) elif k.endswith('Std'): prefix = k[:-4] _writer.add_scalar('{}/{}'.format(prefix, k), v, epoch) elif k.endswith('Mean'): prefix = k[:-5] _writer.add_scalar('{}/{}'.format(prefix, k), v, epoch) elif 'Time' in k: _writer.add_scalar('Time/{}'.format(k), v, epoch) elif k.startswith('Num'): _writer.add_scalar('Number/{}'.format(k), v, epoch) elif k.startswith('Exploration'): _writer.add_scalar('Exploration/{}'.format(k), v, epoch) elif k.startswith('Test'): _writer.add_scalar('Test/{}'.format(k), v, epoch) else: _writer.add_scalar(k, v, epoch) _writer.file_writer.flush() logger.dump_tabular(with_prefix=False, with_timestamp=False) else: logger.log("Skipping eval for now.") def _can_evaluate(self): """ One annoying thing about the logger table is that the keys at each iteration need to be the exact same. So unless you can compute everything, skip evaluation. A common example for why you might want to skip evaluation is that at the beginning of training, you may not have enough data for a validation and training set. :return: """ return (len(self._exploration_paths) > 0 and self.replay_buffer.num_steps_can_sample() >= self.batch_size) def _can_train(self): return self.replay_buffer.num_steps_can_sample() >= self.batch_size def _get_action_and_info(self, observation): """ Get an action to take in the environment. :param observation: :return: """ self.exploration_policy.set_num_steps_total(self._n_env_steps_total) # logic for action skipping, only update the policy action every action_skip timesteps if self.action_skip_count % self.action_skip == 0: self.action_skip_action = self.exploration_policy.get_action( observation) self.action_skip_count += 1 return self.action_skip_action def _start_epoch(self, epoch): self._epoch_start_time = time.time() self._exploration_paths = [] self._do_train_time = 0 self.action_skip_count = 0 logger.push_prefix('Iteration #%d | ' % epoch) def _end_epoch(self): logger.log("Epoch Duration: {0}".format(time.time() - self._epoch_start_time)) logger.log("Started Training: {0}".format(self._can_train())) logger.pop_prefix() def _start_new_rollout(self): self.exploration_policy.reset() self.action_skip_count = 0 return self.training_env.reset() def _handle_path(self, path): """ Naive implementation: just loop through each transition. 
:param path: :return: """ for (ob, action, reward, next_ob, terminal, agent_info, env_info) in zip( path["observations"], path["actions"], path["rewards"], path["next_observations"], path["terminals"], path["agent_infos"], path["env_infos"], ): self._handle_step( ob, action, reward, next_ob, terminal, agent_info=agent_info, env_info=env_info, ) self._handle_rollout_ending() def _handle_step( self, observation, action, reward, next_observation, terminal, agent_info, env_info, ): """ Implement anything that needs to happen after every step :return: """ self._current_path_builder.add_all( observations=observation, actions=action, rewards=reward, next_observations=next_observation, terminals=terminal, agent_infos=agent_info, env_infos=env_info, ) self.replay_buffer.add_sample( observation=observation, action=action, reward=reward, terminal=terminal, next_observation=next_observation, agent_info=agent_info, env_info=env_info, ) def _handle_rollout_ending(self): """ Implement anything that needs to happen after every rollout. """ self.replay_buffer.terminate_episode() self._n_rollouts_total += 1 if len(self._current_path_builder) > 0: self._exploration_paths.append( self._current_path_builder.get_all_stacked()) self._current_path_builder = PathBuilder() def get_epoch_snapshot(self, epoch): data_to_save = dict( epoch=epoch, exploration_policy=self.exploration_policy, ) if self.save_environment: data_to_save['env'] = self.training_env return data_to_save def get_extra_data_to_save(self, epoch): """ Save things that shouldn't be saved every snapshot but rather overwritten every time. :param epoch: :return: """ if self.render: self.training_env.render(close=True) data_to_save = dict(epoch=epoch, ) if self.save_environment: data_to_save['env'] = self.training_env if self.save_replay_buffer: data_to_save['replay_buffer'] = self.replay_buffer if self.save_algorithm: data_to_save['algorithm'] = self return data_to_save @abc.abstractmethod def training_mode(self, mode): """ Set training mode to `mode`. :param mode: If True, training will happen (e.g. set the dropout probabilities to not all ones). """ pass @abc.abstractmethod def cuda(self): """ Turn cuda on. :return: """ pass @abc.abstractmethod def evaluate(self, epoch): """ Evaluate the policy, e.g. save/print progress. :param epoch: :return: """ pass @abc.abstractmethod def _do_training(self): """ Perform some update, e.g. perform one gradient step. :return: """ pass
class RLAlgorithm(metaclass=abc.ABCMeta): def __init__( self, env, exploration_policy: ExplorationPolicy, training_env=None, num_epochs=100, num_steps_per_epoch=10000, num_steps_per_eval=1000, num_updates_per_env_step=1, num_updates_per_epoch=None, batch_size=1024, max_path_length=1000, discount=0.99, replay_buffer_size=1000000, reward_scale=1, min_num_steps_before_training=None, render=False, save_replay_buffer=False, save_algorithm=False, save_environment=True, eval_sampler=None, eval_policy=None, replay_buffer=None, collection_mode='online', ): """ Base class for RL Algorithms :param env: Environment used to evaluate. :param exploration_policy: Policy used to explore :param training_env: Environment used by the algorithm. By default, a copy of `env` will be made for training, so that training and evaluation are completely independent. :param num_epochs: :param num_steps_per_epoch: :param num_steps_per_eval: :param num_updates_per_env_step: Used by online training mode. :param num_updates_per_epoch: Used by batch training mode. :param batch_size: :param max_path_length: :param discount: :param replay_buffer_size: :param reward_scale: :param min_num_steps_before_training: :param render: :param save_replay_buffer: :param save_algorithm: :param save_environment: :param eval_sampler: :param eval_policy: Policy to evaluate with. :param replay_buffer: :param collection_mode: String determining how training happens - 'online': Train after every step taken in the environment. - 'batch': Train after every epoch. """ assert collection_mode in ['online', 'batch'] if collection_mode == 'batch': assert num_updates_per_epoch is not None self.training_env = training_env #or pickle.loads(pickle.dumps(env)) self.exploration_policy = exploration_policy self.num_epochs = num_epochs self.num_env_steps_per_epoch = num_steps_per_epoch self.num_steps_per_eval = num_steps_per_eval if collection_mode == 'online': self.num_updates_per_train_call = num_updates_per_env_step else: self.num_updates_per_train_call = num_updates_per_epoch self.batch_size = batch_size self.max_path_length = max_path_length self.discount = discount self.replay_buffer_size = replay_buffer_size self.reward_scale = reward_scale self.render = render self.collection_mode = collection_mode self.save_replay_buffer = save_replay_buffer self.save_algorithm = save_algorithm self.save_environment = save_environment if min_num_steps_before_training is None: min_num_steps_before_training = self.num_env_steps_per_epoch self.min_num_steps_before_training = min_num_steps_before_training if eval_sampler is None: if eval_policy is None: eval_policy = exploration_policy eval_sampler = InPlacePathSampler( env=env, policy=eval_policy, max_samples=self.num_steps_per_eval + self.max_path_length, max_path_length=self.max_path_length, ) self.eval_policy = eval_policy self.eval_sampler = eval_sampler self.eval_statistics = OrderedDict() self.need_to_update_eval_statistics = True self.action_space = env.action_space self.obs_space = env.observation_space self.env = env if replay_buffer is None: replay_buffer = EnvReplayBuffer( self.replay_buffer_size, self.env, ) self.replay_buffer = replay_buffer self._n_env_steps_total = 0 self._n_train_steps_total = 0 self._n_rollouts_total = 0 self._do_train_time = 0 self._epoch_start_time = None self._algo_start_time = None self._old_table_keys = None self._current_path_builder = PathBuilder() self._exploration_paths = [] self.post_epoch_funcs = [] def train(self, start_epoch=0): self.pretrain() if start_epoch == 0: params = 
self.get_epoch_snapshot(-1) logger.save_itr_params(-1, params) self.training_mode(False) self._n_env_steps_total = start_epoch * self.num_env_steps_per_epoch gt.reset() gt.set_def_unique(False) if self.collection_mode == 'online': self.train_online(start_epoch=start_epoch) elif self.collection_mode == 'batch': self.train_batch(start_epoch=start_epoch) else: raise TypeError("Invalid collection_mode: {}".format( self.collection_mode)) def pretrain(self): pass def train_online(self, start_epoch=0): self._current_path_builder = PathBuilder() for epoch in gt.timed_for( range(start_epoch, self.num_epochs), save_itrs=True, ): self._start_epoch(epoch) set_to_train_mode(self.training_env) observation = self._start_new_rollout() for _ in range(self.num_env_steps_per_epoch): observation = self._take_step_in_env(observation) gt.stamp('sample') self._try_to_train() gt.stamp('train') set_to_eval_mode(self.env) self._try_to_eval(epoch) gt.stamp('eval') self._end_epoch(epoch) def train_batch(self, start_epoch): self._current_path_builder = PathBuilder() for epoch in gt.timed_for( range(start_epoch, self.num_epochs), save_itrs=True, ): self._start_epoch(epoch) set_to_train_mode(self.training_env) observation = self._start_new_rollout() # This implementation is rather naive. If you want to (e.g.) # parallelize data collection, this would be the place to do it. for _ in range(self.num_env_steps_per_epoch): observation = self._take_step_in_env(observation) gt.stamp('sample') self._try_to_train() gt.stamp('train') set_to_eval_mode(self.env) self._try_to_eval(epoch) gt.stamp('eval') self._end_epoch(epoch) def _take_step_in_env(self, observation): action, agent_info = self._get_action_and_info(observation, ) if self.render: self.training_env.render() next_ob, raw_reward, terminal, env_info = ( self.training_env.step(action)) self._n_env_steps_total += 1 reward = raw_reward * self.reward_scale terminal = np.array([terminal]) reward = np.array([reward]) self._handle_step( observation, action, reward, next_ob, terminal, agent_info=agent_info, env_info=env_info, ) if terminal or len(self._current_path_builder) >= self.max_path_length: self._handle_rollout_ending() new_observation = self._start_new_rollout() else: new_observation = next_ob return new_observation def _try_to_train(self): if self._can_train(): self.training_mode(True) for i in range(self.num_updates_per_train_call): self._do_training() self._n_train_steps_total += 1 self.training_mode(False) def _try_to_eval(self, epoch, eval_paths=None): logger.save_extra_data(self.get_extra_data_to_save(epoch)) if self._can_evaluate(): self.evaluate(epoch, eval_paths=eval_paths) params = self.get_epoch_snapshot(epoch) logger.save_itr_params(epoch, params) table_keys = logger.get_table_key_set() if self._old_table_keys is not None: assert table_keys == self._old_table_keys, ( "Table keys cannot change from iteration to iteration.") self._old_table_keys = table_keys logger.record_tabular( "Number of train steps total", self._n_train_steps_total, ) logger.record_tabular( "Number of env steps total", self._n_env_steps_total, ) logger.record_tabular( "Number of rollouts total", self._n_rollouts_total, ) times_itrs = gt.get_times().stamps.itrs train_time = times_itrs['train'][-1] sample_time = times_itrs['sample'][-1] eval_time = times_itrs['eval'][-1] if epoch > 0 else 0 epoch_time = train_time + sample_time + eval_time total_time = gt.get_times().total logger.record_tabular('Train Time (s)', train_time) logger.record_tabular('(Previous) Eval Time (s)', eval_time) 
logger.record_tabular('Sample Time (s)', sample_time) logger.record_tabular('Epoch Time (s)', epoch_time) logger.record_tabular('Total Train Time (s)', total_time) logger.record_tabular("Epoch", epoch) logger.dump_tabular(with_prefix=False, with_timestamp=False) else: logger.log("Skipping eval for now.") def _can_evaluate(self): """ One annoying thing about the logger table is that the keys at each iteration need to be the exact same. So unless you can compute everything, skip evaluation. A common example for why you might want to skip evaluation is that at the beginning of training, you may not have enough data for a validation and training set. """ return (len(self._exploration_paths) > 0 and not self.need_to_update_eval_statistics) def _can_train(self): return (self.replay_buffer.num_steps_can_sample() >= self.min_num_steps_before_training) def _get_action_and_info(self, observation): """ Get an action to take in the environment. :param observation: :return: """ self.exploration_policy.set_num_steps_total(self._n_env_steps_total) return self.exploration_policy.get_action(observation, ) def _start_epoch(self, epoch): self._epoch_start_time = time.time() self._exploration_paths = [] self._do_train_time = 0 logger.push_prefix('Iteration #%d | ' % epoch) def _end_epoch(self, epoch): logger.log("Epoch Duration: {0}".format(time.time() - self._epoch_start_time)) logger.log("Started Training: {0}".format(self._can_train())) logger.pop_prefix() for post_epoch_func in self.post_epoch_funcs: post_epoch_func(self, epoch) def _start_new_rollout(self): self.exploration_policy.reset() return self.training_env.reset() def _handle_path(self, path): """ Naive implementation: just loop through each transition. :param path: :return: """ for (ob, action, reward, next_ob, terminal, agent_info, env_info) in zip( path["observations"], path["actions"], path["rewards"], path["next_observations"], path["terminals"], path["agent_infos"], path["env_infos"], ): self._handle_step( ob, action, reward, next_ob, terminal, agent_info=agent_info, env_info=env_info, ) self._handle_rollout_ending() def _handle_step( self, observation, action, reward, next_observation, terminal, agent_info, env_info, ): """ Implement anything that needs to happen after every step :return: """ self._current_path_builder.add_all( observations=observation, actions=action, rewards=reward, next_observations=next_observation, terminals=terminal, agent_infos=agent_info, env_infos=env_info, ) self.replay_buffer.add_sample( observation=observation, action=action, reward=reward, terminal=terminal, next_observation=next_observation, agent_info=agent_info, env_info=env_info, ) def _handle_rollout_ending(self): """ Implement anything that needs to happen after every rollout. """ self.replay_buffer.terminate_episode() self._n_rollouts_total += 1 if len(self._current_path_builder) > 0: self._exploration_paths.append( self._current_path_builder.get_all_stacked()) self._current_path_builder = PathBuilder() def get_epoch_snapshot(self, epoch): data_to_save = dict( epoch=epoch, exploration_policy=self.exploration_policy, eval_policy=self.eval_policy, ) if self.save_environment: data_to_save['env'] = self.training_env return data_to_save def get_extra_data_to_save(self, epoch): """ Save things that shouldn't be saved every snapshot but rather overwritten every time. 
:param epoch: :return: """ if self.render: self.training_env.render(close=True) data_to_save = dict(epoch=epoch, ) if self.save_environment: data_to_save['env'] = self.training_env if self.save_replay_buffer: data_to_save['replay_buffer'] = self.replay_buffer if self.save_algorithm: data_to_save['algorithm'] = self return data_to_save @abc.abstractmethod def training_mode(self, mode): """ Set training mode to `mode`. :param mode: If True, training will happen (e.g. set the dropout probabilities to not all ones). """ pass def evaluate(self, epoch, eval_paths=None): statistics = OrderedDict() statistics.update(self.eval_statistics) logger.log("Collecting samples for evaluation") if eval_paths: test_paths = eval_paths else: test_paths = self.get_eval_paths() statistics.update( eval_util.get_generic_path_information( test_paths, stat_prefix="Test", )) if len(self._exploration_paths) > 0: statistics.update( eval_util.get_generic_path_information( self._exploration_paths, stat_prefix="Exploration", )) if hasattr(self.env, "log_diagnostics"): self.env.log_diagnostics(test_paths, logger=logger) if hasattr(self.env, "get_diagnostics"): statistics.update(self.env.get_diagnostics(test_paths)) average_returns = eval_util.get_average_returns(test_paths) statistics['AverageReturn'] = average_returns for key, value in statistics.items(): logger.record_tabular(key, value) self.need_to_update_eval_statistics = True def get_eval_paths(self): return self.eval_sampler.obtain_samples() @abc.abstractmethod def _do_training(self): """ Perform some update, e.g. perform one gradient step. :return: """ pass
class RLAlgorithm(metaclass=abc.ABCMeta): def __init__( self, env, exploration_policy: ExplorationPolicy, training_env=None, num_epochs=100, num_steps_per_epoch=10000, num_steps_per_eval=1000, num_updates_per_env_step=1, num_updates_per_epoch=None, batch_size=1024, max_path_length=1000, discount=0.99, replay_buffer_size=1000000, reward_scale=1, min_num_steps_before_training=None, render=False, save_replay_buffer=False, save_algorithm=False, save_environment=True, eval_sampler=None, eval_policy=None, replay_buffer=None, collection_mode='online', save_extra_data_interval=100000, num_gpus=1, num_epochs_per_eval=10, num_epochs_per_param_save=100, **kwargs ): """ Base class for RL Algorithms :param env: Environment used to evaluate. :param exploration_policy: Policy used to explore :param training_env: Environment used by the algorithm. By default, a copy of `env` will be made for training, so that training and evaluation are completely independent. :param num_epochs: :param num_steps_per_epoch: :param num_steps_per_eval: :param num_updates_per_env_step: Used by online training mode. :param num_updates_per_epoch: Used by batch training mode. :param batch_size: :param max_path_length: :param discount: :param replay_buffer_size: :param reward_scale: :param min_num_steps_before_training: :param render: :param save_replay_buffer: :param save_algorithm: :param save_environment: :param eval_sampler: :param eval_policy: Policy to evaluate with. :param replay_buffer: :param collection_mode: String determining how training happens - 'online': Train after every step taken in the environment. - 'batch': Train after every epoch. """ assert collection_mode in ['online', 'batch'] if collection_mode == 'batch': assert num_updates_per_epoch is not None self.training_env = training_env or pickle.loads(pickle.dumps(env)) self.exploration_policy = exploration_policy self.num_epochs = num_epochs self.num_env_steps_per_epoch = num_steps_per_epoch self.num_steps_per_eval = num_steps_per_eval if collection_mode == 'online': self.num_updates_per_train_call = num_updates_per_env_step else: self.num_updates_per_train_call = num_updates_per_epoch self.batch_size = batch_size self.max_path_length = max_path_length self.discount = discount self.replay_buffer_size = replay_buffer_size self.reward_scale = reward_scale self.render = render self.collection_mode = collection_mode self.save_replay_buffer = save_replay_buffer self.save_algorithm = save_algorithm self.save_environment = save_environment if min_num_steps_before_training is None: min_num_steps_before_training = self.num_env_steps_per_epoch self.min_num_steps_before_training = min_num_steps_before_training if eval_sampler is None: if eval_policy is None: eval_policy = exploration_policy eval_sampler = InPlacePathSampler( env=env, policy=eval_policy, max_samples=self.num_steps_per_eval + self.max_path_length, max_path_length=self.max_path_length, ) self.eval_policy = eval_policy self.eval_sampler = eval_sampler self.eval_statistics = OrderedDict() self.need_to_update_eval_statistics = True self.action_space = env.action_space self.obs_space = env.observation_space self.env = env if replay_buffer is None: replay_buffer = EnvReplayBuffer( self.replay_buffer_size, self.env, ) self.replay_buffer = replay_buffer self._n_env_steps_total = 0 self._n_train_steps_total = 0 self._n_rollouts_total = 0 self._do_train_time = 0 self._epoch_start_time = None self._algo_start_time = None self._old_table_keys = None self._current_path_builder = PathBuilder() self._exploration_paths = 
[] self.post_epoch_funcs = [] self.save_extra_data_interval = save_extra_data_interval # MPI stuff if MPI and ptu.get_mode(): self.gpu_id = MPI.COMM_WORLD.Get_rank()%num_gpus self.num_epochs_per_eval = num_epochs_per_eval assert num_epochs_per_param_save % num_epochs_per_eval == 0 self.num_epochs_per_param_save = num_epochs_per_param_save import collections # self.reward_buffer = collections.deque([-2*10], 10) def train(self, start_epoch=0): self.pretrain() if start_epoch == 0 and MPI and MPI.COMM_WORLD.Get_rank() == 0: params = self.get_epoch_snapshot(-1) logger.save_itr_params(-1, params) self.training_mode(False) self._n_env_steps_total = start_epoch * self.num_env_steps_per_epoch gt.reset() gt.set_def_unique(False) if self.collection_mode == 'online': self.train_online(start_epoch=start_epoch) elif self.collection_mode == 'batch': self.train_batch(start_epoch=start_epoch) else: raise TypeError("Invalid collection_mode: {}".format( self.collection_mode )) def pretrain(self): pass def train_online(self, start_epoch=0): self._current_path_builder = PathBuilder() for epoch in gt.timed_for( range(start_epoch, self.num_epochs), save_itrs=True, ): self._start_epoch(epoch) set_to_train_mode(self.training_env) observation = self._start_new_rollout() for _ in range(self.num_env_steps_per_epoch): observation = self._take_step_in_env(observation) gt.stamp('sample') self._try_to_train() gt.stamp('train') set_to_eval_mode(self.env) self._try_to_eval(epoch) gt.stamp('eval') self._end_epoch(epoch) def train_batch(self, start_epoch): self._current_path_builder = PathBuilder() for epoch in gt.timed_for( range(start_epoch, self.num_epochs), save_itrs=True, ): self._start_epoch(epoch) set_to_train_mode(self.training_env) observation = self._start_new_rollout() # This implementation is rather naive. If you want to (e.g.) # parallelize data collection, this would be the place to do it. 
for _ in range(self.num_env_steps_per_epoch): observation = self._take_step_in_env(observation) gt.stamp('sample') # self.qf1_optimizer.reinit_flat_operators() #TODO what is this self._try_to_train() gt.stamp('train') set_to_eval_mode(self.env) if epoch % self.num_epochs_per_eval == 0: self._try_to_eval(epoch) gt.stamp('eval') self._end_epoch(epoch) def _take_step_in_env(self, observation): action, agent_info = self._get_action_and_info( observation, ) # TODO: remove # self.qf1.pooler.current_time_step += 1 # self.qf2.pooler.current_time_step += 1 # self.vf.pooler.current_time_step += 1 # self.qf1.pooler.max_time_horizon = 50 * 2 # self.qf2.pooler.max_time_horizon = 50 * 2 # self.vf.pooler.max_time_horizon = 50 * 2 if self.render: self.training_env.render() next_ob, raw_reward, terminal, env_info = ( self.training_env.step(action) ) # self.reward_buffer.append(raw_reward) # if sum(self.reward_buffer) >= 0 and self.policy.selection_attention.hard_block == 0: # self.policy.selection_attention.hard_block = 1 # self.qf1.pooler.selection_attention.hard_block = 1 self._n_env_steps_total += 1 reward = raw_reward * self.reward_scale terminal = np.array([terminal]) reward = np.array([reward]) self._handle_step( observation, action, reward, next_ob, terminal, agent_info=agent_info, env_info=env_info, mask=get_masks(self.training_env.unwrapped.num_blocks, self.replay_buffer.max_num_blocks, 1) ) # print(F"cpb len {len(self._current_path_builder)}") # print(F"terminal {terminal}") if terminal or len(self._current_path_builder) >= self.max_path_length: self._handle_rollout_ending() new_observation = self._start_new_rollout() else: new_observation = next_ob return new_observation def _try_to_train(self): # assert self.alpha_optimizer.param_groups[0]['params'][0] if ptu.get_mode() == "gpu_opt": ptu.set_device(device_id=self.gpu_id, device_type="gpu") self.to(device=torch.device(F"cuda:{self.gpu_id}")) # assert self.alpha_optimizer.m.device.type == "cuda" # assert self.alpha_optimizer.m.device.type == "cuda" if self._can_train(): self.training_mode(True) # assert self.alpha_optimizer.m.device.type == "cuda" for i in range(self.num_updates_per_train_call): self._do_training() # assert self.alpha_optimizer.m.device.type == "cuda" self._n_train_steps_total += 1 self.training_mode(False) if ptu.get_mode() == "gpu_opt": ptu.set_device(device_type="cpu") self.to(device=torch.device("cpu")) def _try_to_eval(self, epoch, eval_paths=None): if MPI and MPI.COMM_WORLD.Get_rank() == 0: if epoch % self.save_extra_data_interval == 0: logger.save_extra_data(self.get_extra_data_to_save(epoch)) if epoch % self.num_epochs_per_param_save == 0: print("Attemping itr param save...") params = self.get_epoch_snapshot(epoch) logger.save_itr_params(epoch, params) print(F"Itr{epoch} param saved!") if self._can_evaluate(): self.evaluate(epoch, eval_paths=eval_paths) logger.record_tabular( "Number of train steps total", self._n_train_steps_total, ) logger.record_tabular( "Number of env steps total", self._n_env_steps_total, ) logger.record_tabular( "Number of rollouts total", self._n_rollouts_total, ) times_itrs = gt.get_times().stamps.itrs # train_time = times_itrs['train'][-1] training_loops = ['get_batch', 'update_normalizer', 'forward', 'compute_losses', 'qf1_loop', "policy_loss_forward", 'policy_loop', 'vf_loop'] train_time = sum(times_itrs[loop][-1] for loop in times_itrs.keys()) sample_time = times_itrs['sample'][-1] if epoch > 0: eval_time = times_itrs['eval'][-1] else: times_itrs['eval'] = [0] # Need to do this so we can do 
line 343, the list comprehension eval_time = 0 epoch_time = train_time + sample_time + eval_time total_time = gt.get_times().total # logger.record_tabular('Get Batch (s)', times_itrs['get_batch'][-1]) # logger.record_tabular('Update Normalizer (s)', times_itrs['update_normalizer'][-1]) # logger.record_tabular('Forward (s)', times_itrs['forward'][-1]) # logger.record_tabular('Compute Losses (s)', times_itrs['compute_losses'][-1]) # logger.record_tabular('QF1 Loop (s)', times_itrs['qf1_loop'][-1]) # logger.record_tabular('QF2 Loop (s)', times_itrs['qf2_loop'][-1]) # logger.record_tabular("Policy Forward (s)", times_itrs['policy_loss_forward'][-1]) # logger.record_tabular('Policy Loop (s)', times_itrs['policy_loop'][-1]) # logger.record_tabular('VF Loop (s)', times_itrs['vf_loop'][-1]) [logger.record_tabular(key.title(), times_itrs[key][-1]) for key in times_itrs.keys()] logger.record_tabular('Train Time (s) ---', train_time) logger.record_tabular('(Previous) Eval Time (s) ---', eval_time) logger.record_tabular('Sample Time (s) ---', sample_time) logger.record_tabular('Epoch Time (s)', epoch_time) logger.record_tabular('Total Train Time (s)', total_time) logger.record_tabular("Epoch", epoch) table_keys = logger.get_table_key_set() if self._old_table_keys is not None and table_keys != self._old_table_keys: # assert table_keys == self._old_table_keys, ( # "Table keys cannot change from iteration to iteration." # ) print("Table keys have changed. Rewriting header and filling with 0s") logger.update_header() raise NotImplementedError self._old_table_keys = table_keys logger.dump_tabular(with_prefix=False, with_timestamp=False) else: logger.log("Skipping eval for now.") def _can_evaluate(self): """ One annoying thing about the logger table is that the keys at each iteration need to be the exact same. So unless you can compute everything, skip evaluation. A common example for why you might want to skip evaluation is that at the beginning of training, you may not have enough data for a validation and training set. """ return ( len(self._exploration_paths) > 0 and not self.need_to_update_eval_statistics ) def _can_train(self): return ( self.replay_buffer.num_steps_can_sample() >= self.min_num_steps_before_training ) def _get_action_and_info(self, observation): """ Get an action to take in the environment. :param observation: :return: """ self.exploration_policy.set_num_steps_total(self._n_env_steps_total) return self.exploration_policy.get_action( observation, ) def _start_epoch(self, epoch): self._epoch_start_time = time.time() self._exploration_paths = [] self._do_train_time = 0 logger.push_prefix('Iteration #%d | ' % epoch) def _end_epoch(self, epoch): logger.log("Epoch Duration: {0}".format( time.time() - self._epoch_start_time )) logger.log("Started Training: {0}".format(self._can_train())) logger.pop_prefix() for post_epoch_func in self.post_epoch_funcs: post_epoch_func(self, epoch) def _start_new_rollout(self): self.exploration_policy.reset() return self.training_env.reset() def _handle_path(self, path): """ Naive implementation: just loop through each transition. 
:param path: :return: """ for ( ob, action, reward, next_ob, terminal, agent_info, env_info ) in zip( path["observations"], path["actions"], path["rewards"], path["next_observations"], path["terminals"], path["agent_infos"], path["env_infos"], ): self._handle_step( ob, action, reward, next_ob, terminal, agent_info=agent_info, env_info=env_info, ) self._handle_rollout_ending() def _handle_step( self, observation, action, reward, next_observation, terminal, agent_info, env_info, # full_observations ): """ Implement anything that needs to happen after every step :return: """ self._current_path_builder.add_all( observations=observation, actions=action, rewards=reward, next_observations=next_observation, terminals=terminal, agent_infos=agent_info, env_infos=env_info, # full_observations=full_observations, ) self.replay_buffer.add_sample( observation=observation, action=action, reward=reward, terminal=terminal, next_observation=next_observation, agent_info=agent_info, env_info=env_info, # full_observations=full_observations, ) def _handle_rollout_ending(self): """ Implement anything that needs to happen after every rollout. """ self.replay_buffer.terminate_episode() self._n_rollouts_total += 1 if len(self._current_path_builder) > 0: path = self._current_path_builder.get_all_stacked() self._exploration_paths.append(path ) self._current_path_builder = PathBuilder() def get_epoch_snapshot(self, epoch): data_to_save = dict( epoch=epoch, exploration_policy=self.exploration_policy, eval_policy=self.eval_policy, ) if self.save_environment: data_to_save['env'] = self.training_env return data_to_save def get_extra_data_to_save(self, epoch): """ Save things that shouldn't be saved every snapshot but rather overwritten every time. :param epoch: :return: """ if self.render: self.training_env.render(close=True) data_to_save = dict( epoch=epoch, ) if self.save_environment: data_to_save['env'] = self.training_env if self.save_replay_buffer: data_to_save['replay_buffer'] = self.replay_buffer if self.save_algorithm: data_to_save['algorithm'] = self return data_to_save @abc.abstractmethod def training_mode(self, mode): """ Set training mode to `mode`. :param mode: If True, training will happen (e.g. set the dropout probabilities to not all ones). 
""" pass def evaluate(self, epoch, eval_paths=None): statistics = OrderedDict() statistics.update(self.eval_statistics) logger.log("Collecting samples for evaluation") if eval_paths: test_paths = eval_paths else: test_paths = self.get_eval_paths() if hasattr(self.env.unwrapped, "num_blocks"): statistics.update(eval_util.get_generic_path_information( test_paths, stat_prefix="Test", num_blocks=self.env.unwrapped.num_blocks )) if len(self._exploration_paths) > 0: statistics.update(eval_util.get_generic_path_information( self._exploration_paths, stat_prefix="Exploration" )) else: statistics.update(eval_util.get_generic_path_information( test_paths, stat_prefix="Test", num_blocks=None )) if len(self._exploration_paths) > 0: statistics.update(eval_util.get_generic_path_information( self._exploration_paths, stat_prefix="Exploration", num_blocks=None )) if hasattr(self.env, "log_diagnostics"): self.env.log_diagnostics(test_paths, logger=logger) if hasattr(self.env, "get_diagnostics"): statistics.update(self.env.get_diagnostics(test_paths)) average_returns = eval_util.get_average_returns(test_paths) statistics['AverageReturn'] = average_returns for key, value in statistics.items(): logger.record_tabular(key, value) self.need_to_update_eval_statistics = True def get_eval_paths(self): return self.eval_sampler.obtain_samples() @abc.abstractmethod def _do_training(self): """ Perform some update, e.g. perform one gradient step. :return: """ pass
class NPMetaRLAlgorithm(metaclass=abc.ABCMeta): def __init__( self, env_sampler, exploration_policy: ExplorationPolicy, neural_process, train_neural_process=False, latent_repr_mode='concat_params', # OR concat_samples num_latent_samples=5, num_epochs=100, num_steps_per_epoch=10000, num_steps_per_eval=1000, num_updates_per_env_step=1, batch_size=1024, max_path_length=1000, discount=0.99, replay_buffer_size=1000000, reward_scale=1, render=False, save_replay_buffer=False, save_algorithm=False, save_environment=False, eval_sampler=None, eval_policy=None, replay_buffer=None, epoch_to_start_training=0): """ Base class for RL Algorithms :param env: Environment used to evaluate. :param exploration_policy: Policy used to explore :param training_env: Environment used by the algorithm. By default, a copy of `env` will be made. :param num_epochs: :param num_steps_per_epoch: :param num_steps_per_eval: :param num_updates_per_env_step: Used by online training mode. :param num_updates_per_epoch: Used by batch training mode. :param batch_size: :param max_path_length: :param discount: :param replay_buffer_size: :param reward_scale: :param render: :param save_replay_buffer: :param save_algorithm: :param save_environment: :param eval_sampler: :param eval_policy: Policy to evaluate with. :param replay_buffer: """ assert not train_neural_process, 'Have not implemented it yet! Remember to set it to train mode when training' self.neural_process = neural_process self.neural_process.set_mode('eval') self.latent_repr_mode = latent_repr_mode self.num_latent_samples = num_latent_samples self.env_sampler = env_sampler env, env_specs = env_sampler() self.training_env, _ = env_sampler(env_specs) # self.training_env = training_env or pickle.loads(pickle.dumps(env)) # self.training_env = training_env or deepcopy(env) self.exploration_policy = exploration_policy self.num_epochs = num_epochs self.num_env_steps_per_epoch = num_steps_per_epoch self.num_steps_per_eval = num_steps_per_eval self.num_updates_per_train_call = num_updates_per_env_step self.batch_size = batch_size self.max_path_length = max_path_length self.discount = discount self.replay_buffer_size = replay_buffer_size self.reward_scale = reward_scale self.render = render self.save_replay_buffer = save_replay_buffer self.save_algorithm = save_algorithm self.save_environment = save_environment self.epoch_to_start_training = epoch_to_start_training if self.latent_repr_mode == 'concat_params': def get_latent_repr(posterior_state): z_mean, z_cov = self.neural_process.get_posterior_params( posterior_state) return np.concatenate([z_mean, z_cov]) self.extra_obs_dim = 2 * self.neural_process.z_dim else: def get_latent_repr(posterior_state): z_mean, z_cov = self.neural_process.get_posterior_params( posterior_state) samples = np.random.multivariate_normal( z_mean, np.diag(z_cov), self.num_latent_samples) samples = samples.flatten() return samples self.extra_obs_dim = self.num_latent_samples * self.neural_process.z_dim self.get_latent_repr = get_latent_repr if eval_sampler is None: if eval_policy is None: eval_policy = exploration_policy eval_sampler = InPlacePathSampler( env=env, policy=eval_policy, max_samples=self.num_steps_per_eval + self.max_path_length, max_path_length=self.max_path_length, neural_process=neural_process, latent_repr_fn=get_latent_repr, reward_scale=reward_scale) self.eval_policy = eval_policy self.eval_sampler = eval_sampler self.action_space = env.action_space self.obs_space = env.observation_space self.env = env obs_space_dim = 
gym_get_dim(self.obs_space) act_space_dim = gym_get_dim(self.action_space) if replay_buffer is None: replay_buffer = SimpleReplayBuffer( self.replay_buffer_size, obs_space_dim + self.extra_obs_dim, act_space_dim, discrete_action_dim=isinstance(self.action_space, Discrete)) self.replay_buffer = replay_buffer self._n_env_steps_total = 0 self._n_train_steps_total = 0 self._n_rollouts_total = 0 self._do_train_time = 0 self._epoch_start_time = None self._algo_start_time = None self._old_table_keys = None self._current_path_builder = PathBuilder() self._exploration_paths = [] def train(self, start_epoch=0): self.pretrain() if start_epoch == 0: params = self.get_epoch_snapshot(-1) logger.save_itr_params(-1, params) self.training_mode(False) self._n_env_steps_total = start_epoch * self.num_env_steps_per_epoch gt.reset() gt.set_def_unique(False) self.train_online(start_epoch=start_epoch) def pretrain(self): """ Do anything before the main training phase. """ pass def train_online(self, start_epoch=0): self._current_path_builder = PathBuilder() observation = self._start_new_rollout() for epoch in gt.timed_for( range(start_epoch, self.num_epochs), save_itrs=True, ): self._start_epoch(epoch) for _ in range(self.num_env_steps_per_epoch): action, agent_info = self._get_action_and_info(observation, ) if self.render: self.training_env.render() next_ob, raw_reward, terminal, env_info = ( self.training_env.step(action)) self._n_env_steps_total += 1 reward = raw_reward * self.reward_scale terminal = np.array([terminal]) reward = np.array([reward]) self.posterior_state = self.neural_process.update_posterior_state( self.posterior_state, observation[self.extra_obs_dim:], action, reward, next_ob) next_ob = np.concatenate( [self.get_latent_repr(self.posterior_state), next_ob]) self._handle_step( observation, action, reward, next_ob, terminal, agent_info=agent_info, env_info=env_info, ) if terminal or len( self._current_path_builder) >= self.max_path_length: self._handle_rollout_ending() observation = self._start_new_rollout() else: observation = next_ob gt.stamp('sample') if epoch >= self.epoch_to_start_training: self._try_to_train() gt.stamp('train') if epoch >= self.epoch_to_start_training: self._try_to_eval(epoch) gt.stamp('eval') self._end_epoch() def _try_to_train(self): if self._can_train(): self.training_mode(True) for i in range(self.num_updates_per_train_call): self._do_training() self._n_train_steps_total += 1 self.training_mode(False) def _try_to_eval(self, epoch): logger.save_extra_data(self.get_extra_data_to_save(epoch)) if self._can_evaluate(): self.evaluate(epoch) params = self.get_epoch_snapshot(epoch) logger.save_itr_params(epoch, params) table_keys = logger.get_table_key_set() if self._old_table_keys is not None: print('$$$$$$$$$$$$$$$') print(table_keys) print('\n' * 4) print(self._old_table_keys) print('$$$$$$$$$$$$$$$') print(set(table_keys) - set(self._old_table_keys)) print(set(self._old_table_keys) - set(table_keys)) assert table_keys == self._old_table_keys, ( "Table keys cannot change from iteration to iteration.") self._old_table_keys = table_keys logger.record_tabular( "Number of train steps total", self._n_train_steps_total, ) logger.record_tabular( "Number of env steps total", self._n_env_steps_total, ) logger.record_tabular( "Number of rollouts total", self._n_rollouts_total, ) times_itrs = gt.get_times().stamps.itrs train_time = times_itrs['train'][-1] sample_time = times_itrs['sample'][-1] eval_time = times_itrs['eval'][-1] if epoch > 0 else 0 epoch_time = train_time + sample_time 
+ eval_time total_time = gt.get_times().total logger.record_tabular('Train Time (s)', train_time) logger.record_tabular('(Previous) Eval Time (s)', eval_time) logger.record_tabular('Sample Time (s)', sample_time) logger.record_tabular('Epoch Time (s)', epoch_time) logger.record_tabular('Total Train Time (s)', total_time) logger.record_tabular("Epoch", epoch) logger.dump_tabular(with_prefix=False, with_timestamp=False) else: logger.log("Skipping eval for now.") def _can_evaluate(self): """ One annoying thing about the logger table is that the keys at each iteration need to be the exact same. So unless you can compute everything, skip evaluation. A common example for why you might want to skip evaluation is that at the beginning of training, you may not have enough data for a validation and training set. :return: """ return (len(self._exploration_paths) > 0 and self.replay_buffer.num_steps_can_sample() >= self.batch_size) def _can_train(self): return self.replay_buffer.num_steps_can_sample() >= self.batch_size def _get_action_and_info(self, observation): """ Get an action to take in the environment. :param observation: :return: """ self.exploration_policy.set_num_steps_total(self._n_env_steps_total) return self.exploration_policy.get_action(observation, ) def _start_epoch(self, epoch): self._epoch_start_time = time.time() self._exploration_paths = [] self._do_train_time = 0 logger.push_prefix('Iteration #%d | ' % epoch) def _end_epoch(self): logger.log("Epoch Duration: {0}".format(time.time() - self._epoch_start_time)) logger.log("Started Training: {0}".format(self._can_train())) logger.pop_prefix() def _start_new_rollout(self): self.exploration_policy.reset() self.env, env_specs = self.env_sampler() self.training_env, _ = self.env_sampler(env_specs) obs = self.training_env.reset() self.posterior_state = self.neural_process.reset_posterior_state() latent_repr = self.get_latent_repr(self.posterior_state) obs = np.concatenate([latent_repr, obs]) return obs def _handle_path(self, path): """ Naive implementation: just loop through each transition. :param path: :return: """ for (ob, action, reward, next_ob, terminal, agent_info, env_info) in zip( path["observations"], path["actions"], path["rewards"], path["next_observations"], path["terminals"], path["agent_infos"], path["env_infos"], ): self._handle_step( ob, action, reward, next_ob, terminal, agent_info=agent_info, env_info=env_info, ) self._handle_rollout_ending() def _handle_step( self, observation, action, reward, next_observation, terminal, agent_info, env_info, ): """ Implement anything that needs to happen after every step :return: """ self._current_path_builder.add_all( observations=observation, actions=action, rewards=reward, next_observations=next_observation, terminals=terminal, agent_infos=agent_info, env_infos=env_info, ) self.replay_buffer.add_sample( observation=observation, action=action, reward=reward, terminal=terminal, next_observation=next_observation, agent_info=agent_info, env_info=env_info, ) def _handle_rollout_ending(self): """ Implement anything that needs to happen after every rollout. 
""" self.replay_buffer.terminate_episode() self._n_rollouts_total += 1 if len(self._current_path_builder) > 0: self._exploration_paths.append( self._current_path_builder.get_all_stacked()) self._current_path_builder = PathBuilder() def get_epoch_snapshot(self, epoch): data_to_save = dict( epoch=epoch, exploration_policy=self.exploration_policy, ) if self.save_environment: data_to_save['env'] = self.training_env return data_to_save def get_extra_data_to_save(self, epoch): """ Save things that shouldn't be saved every snapshot but rather overwritten every time. :param epoch: :return: """ if self.render: self.training_env.render(close=True) data_to_save = dict(epoch=epoch, ) if self.save_environment: data_to_save['env'] = self.training_env if self.save_replay_buffer: data_to_save['replay_buffer'] = self.replay_buffer if self.save_algorithm: data_to_save['algorithm'] = self return data_to_save @abc.abstractmethod def training_mode(self, mode): """ Set training mode to `mode`. :param mode: If True, training will happen (e.g. set the dropout probabilities to not all ones). """ pass @abc.abstractmethod def cuda(self): """ Turn cuda on. :return: """ pass @abc.abstractmethod def evaluate(self, epoch): """ Evaluate the policy, e.g. save/print progress. :param epoch: :return: """ pass @abc.abstractmethod def _do_training(self): """ Perform some update, e.g. perform one gradient step. :return: """ pass
class ThompsonSoftActorCritic(TorchRLAlgorithm): def __init__(self, env, policies, qf1s, qf2s, pqf1s, pqf2s, policy_lr=1e-3, qf_lr=1e-3, droprate=0.5, prior_coef=1, prior_offset=0, heads=10, policy_mean_reg_weight=1e-3, policy_std_reg_weight=1e-3, policy_pre_activation_weight=0., optimizer_class=optim.Adam, train_policy_with_reparameterization=True, soft_target_tau=1e-2, plotter=None, render_eval_paths=False, eval_deterministic=True, replay_buffer_size=1000000, use_automatic_entropy_tuning=True, target_entropy=None, **kwargs): if eval_deterministic: eval_policy = MultiMakeDeterministic(policies[0]) else: eval_policy = policies[0] self.heads = heads self.prior_coef = prior_coef replay_buffer = CustomReplayBuffer( replay_buffer_size, env, get_dim(env.observation_space) + self.heads) super().__init__(env=env, exploration_policy=policies[0], eval_policy=eval_policy, replay_buffer=replay_buffer, **kwargs) self.current_behavior_policy = 0 self.policies = policies self.qf1s = qf1s self.qf2s = qf2s self.pqf1s = pqf1s self.pqf2s = pqf2s self.droprate = droprate self.train_policy_with_reparameterization = ( train_policy_with_reparameterization) self.soft_target_tau = soft_target_tau self.policy_mean_reg_weight = policy_mean_reg_weight self.policy_std_reg_weight = policy_std_reg_weight self.policy_pre_activation_weight = policy_pre_activation_weight self.plotter = plotter self.render_eval_paths = render_eval_paths self.use_automatic_entropy_tuning = use_automatic_entropy_tuning if self.use_automatic_entropy_tuning: if target_entropy: self.target_entropy = target_entropy else: self.target_entropy = -np.prod( self.env.action_space.shape).item( ) # heuristic value from Tuomas self.log_alphas = [ ptu.zeros(1, requires_grad=True) for _ in range(heads) ] self.alpha_optimizers = [ optimizer_class( [log_alpha], lr=policy_lr, ) for log_alpha in self.log_alphas ] self.prior_offset = prior_offset self.target_qf1s = [qf1.copy() for qf1 in qf1s] self.target_qf2s = [qf2.copy() for qf2 in qf2s] self.qf_criterion = nn.MSELoss() self.policy_optimizers = [ optimizer_class( policy.parameters(), lr=policy_lr, ) for policy in policies ] self.qf1_optimizers = [ optimizer_class( qf1.parameters(), lr=qf_lr, ) for qf1 in qf1s ] self.qf2_optimizers = [ optimizer_class( qf2.parameters(), lr=qf_lr, ) for qf2 in qf2s ] def train_head(self, head): for step in range(1, 1000): if step % 100 == 0: print(head, step) self._do_training(head) def _do_training(self, head): batch = self.get_batch() rewards = batch['rewards'] terminals = batch['terminals'] obs = batch['observations'][:, :-self.heads] mask = batch['observations'][:, -self.heads + head] actions = batch['actions'] next_obs = batch['next_observations'][:, :-self.heads] qf1 = self.qf1s[head] qf2 = self.qf2s[head] pqf1 = self.pqf1s[head] pqf2 = self.pqf2s[head] target_qf1 = self.target_qf1s[head] target_qf2 = self.target_qf2s[head] policy = self.policies[head] q1_pred = qf1( obs, actions) + self.prior_coef * pqf1(obs, actions) + self.prior_offset q2_pred = qf2( obs, actions) + self.prior_coef * pqf2(obs, actions) + self.prior_offset qf1_loss, qf2_loss = 0, 0 # Make sure policy accounts for squashing functions like tanh correctly! 
policy_outputs = policy( obs, reparameterize=self.train_policy_with_reparameterization, return_log_prob=True, ) new_actions, policy_mean, policy_log_std, log_pi = policy_outputs[:4] # new_actions: 128 x 10 x 1 if self.use_automatic_entropy_tuning: """ Alpha Loss """ alpha_loss = -(self.log_alphas[head] * (log_pi + self.target_entropy).detach()).mean() self.alpha_optimizers[head].zero_grad() alpha_loss.backward() self.alpha_optimizers[head].step() alpha = self.log_alpha.exp() else: alpha = 1 alpha_loss = 0 """ QF Loss """ next_policy_outputs = policy( next_obs, reparameterize=self.train_policy_with_reparameterization, return_log_prob=True, ) next_new_actions, _, _, next_log_pi = next_policy_outputs[:4] next_q_new_actions = torch.min( target_qf1(next_obs, next_new_actions) + self.prior_coef * pqf1(next_obs, next_new_actions) + self.prior_offset, target_qf2(next_obs, next_new_actions) + self.prior_coef * pqf2(next_obs, next_new_actions) + self.prior_offset, ) # 128 x 10 target_v_values = next_q_new_actions - alpha * next_log_pi q_target = rewards + (1. - terminals) * self.discount * target_v_values qf1_loss = (((q1_pred - q_target.detach())**2.0) * mask).sum(1) qf2_loss = (((q2_pred - q_target.detach())**2.0) * mask).sum(1) """ VF Loss """ q_new_actions = torch.min( qf1(obs, new_actions) + self.prior_coef * pqf1(obs, new_actions) + self.prior_offset, qf2(obs, new_actions) + self.prior_coef * pqf2(obs, new_actions) + self.prior_offset, ) """ Policy Loss """ if self.train_policy_with_reparameterization: kl_loss = (alpha * log_pi - q_new_actions).mean() else: #v_pred = q_new_actions - alpha*log_pi log_policy_target = q_new_actions # - v_pred kl_loss = (log_pi * (alpha * log_pi - log_policy_target).detach()).mean() mean_reg_loss = self.policy_mean_reg_weight * (policy_mean** 2).sum(1).mean() std_reg_loss = self.policy_std_reg_weight * (policy_log_std** 2).sum(1).mean() pre_tanh_value = policy_outputs[-1] pre_activation_reg_loss = self.policy_pre_activation_weight * ( (pre_tanh_value**2).sum(dim=1).mean()) policy_reg_loss = mean_reg_loss + std_reg_loss + pre_activation_reg_loss policy_loss = kl_loss + policy_reg_loss """ Update networks """ self.qf1_optimizers[head].zero_grad() qf1_loss.mean().backward() self.qf1_optimizers[head].step() self.qf2_optimizers[head].zero_grad() qf2_loss.mean().backward() self.qf2_optimizers[head].step() self.policy_optimizers[head].zero_grad() policy_loss.backward() self.policy_optimizers[head].step() self._update_target_network(head) """ Save some statistics for eval using just one batch. 
""" if self.need_to_update_eval_statistics: self.need_to_update_eval_statistics = False self.eval_statistics['QF1 Loss'] = np.mean(ptu.get_numpy(qf1_loss)) self.eval_statistics['QF2 Loss'] = np.mean(ptu.get_numpy(qf2_loss)) #self.eval_statistics['VF Loss'] = np.mean(ptu.get_numpy(vf_loss)) self.eval_statistics['Policy Loss'] = np.mean( ptu.get_numpy(policy_loss)) try: self.eval_statistics['gmm mus mean'] = np.mean( ptu.get_numpy(self.policy.mean)) self.eval_statistics['gmm log w mean'] = np.mean( ptu.get_numpy(self.policy.log_w)) self.eval_statistics['gmm log std mean'] = np.mean( ptu.get_numpy(self.policy.log_std)) except: pass self.eval_statistics['KL Loss'] = np.mean(ptu.get_numpy(kl_loss)) self.eval_statistics['Policy Reg Loss'] = np.mean( ptu.get_numpy(policy_reg_loss)) self.eval_statistics.update( create_stats_ordered_dict( 'Q1 Predictions', ptu.get_numpy(q1_pred), )) self.eval_statistics.update( create_stats_ordered_dict( 'Q2 Predictions', ptu.get_numpy(q2_pred), )) #self.eval_statistics.update(create_stats_ordered_dict( # 'V Predictions', # ptu.get_numpy(v_pred), #)) self.eval_statistics.update( create_stats_ordered_dict( 'Log Pis', ptu.get_numpy(log_pi), )) self.eval_statistics.update( create_stats_ordered_dict( 'Policy mu', ptu.get_numpy(policy_mean), )) self.eval_statistics.update( create_stats_ordered_dict( 'Policy log std', ptu.get_numpy(policy_log_std), )) if self.use_automatic_entropy_tuning: self.eval_statistics['Alpha'] = alpha.item() self.eval_statistics['Alpha Loss'] = alpha_loss.item() def train_online(self, start_epoch=0): self._current_path_builder = PathBuilder() observation = self._start_new_rollout() for epoch in gt.timed_for( range(start_epoch, self.num_epochs), save_itrs=True, ): self._start_epoch(epoch) set_to_train_mode(self.training_env) self.training_mode(True) processes = [] import threading gt.stamp('sample') if self._can_train(): ctx = mp.get_context("spawn") for net in self.networks: # net.cuda() net.share_memory() for rank in range(0, self.heads): p = ctx.Process(target=self.train_head, args=(rank, )) p.start() processes.append(p) for step in range(self.num_env_steps_per_epoch): observation = self._take_step_in_env(observation) #self._try_to_train() for p in processes: p.join() gt.stamp('train') self.training_mode(False) self._n_train_steps_total += self.num_env_steps_per_epoch self.current_behavior_policy = np.random.randint(self.heads) set_to_eval_mode(self.env) self._try_to_eval(epoch) gt.stamp('eval') self._end_epoch(epoch) def _take_step_in_env(self, observation): #action, agent_info = self._get_action_and_info( # observation, #) self.policies[0].set_num_steps_total(self._n_env_steps_total) action, agent_info = self.policies[0].get_action( observation, self.current_behavior_policy, ) if self.render: self.training_env.render() next_ob, raw_reward, terminal, env_info = ( self.training_env.step(action)) self._n_env_steps_total += 1 reward = raw_reward * self.reward_scale terminal = np.array([terminal]) reward = np.array([reward]) self._handle_step( observation, action, reward, next_ob, terminal, agent_info=agent_info, env_info=env_info, ) if terminal or len(self._current_path_builder) >= self.max_path_length: self._handle_rollout_ending() new_observation = self._start_new_rollout() else: new_observation = next_ob return new_observation def _handle_step( self, observation, action, reward, next_observation, terminal, agent_info, env_info, ): """ Implement anything that needs to happen after every step :return: """ if self.heads == 1: mask = [1] else: 
mask = np.array([ np.random.uniform() > self.droprate for _ in range(self.heads) ]) #np.random.randint(2, size=self.heads) #mask = [1] * self.heads observation = np.concatenate([observation, mask]) next_observation = np.concatenate( [next_observation, np.zeros(self.heads)]) self._current_path_builder.add_all( observations=observation, actions=action, rewards=reward, next_observations=next_observation, terminals=terminal, agent_infos=agent_info, env_infos=env_info, ) self.replay_buffer.add_sample( observation=observation, action=action, reward=reward, terminal=terminal, next_observation=next_observation, agent_info=agent_info, env_info=env_info, ) @property def networks(self): nets = [] nets.extend(self.qf1s) nets.extend(self.qf2s) nets.extend(self.pqf1s) nets.extend(self.pqf2s) nets.extend(self.target_qf1s) nets.extend(self.target_qf2s) nets.extend(self.policies) return nets def _update_target_network(self, head): ptu.soft_update_from_to(self.qf1s[head], self.target_qf1s[head], self.soft_target_tau) ptu.soft_update_from_to(self.qf2s[head], self.target_qf2s[head], self.soft_target_tau) def get_epoch_snapshot(self, epoch): snapshot = super().get_epoch_snapshot(epoch) for i, (qf1, qf2, pqf1, pqf2, target_qf1, target_qf2, policy) in enumerate( zip(self.qf1s, self.qf2s, self.pqf1s, self.pqf2s, self.target_qf1s, self.target_qf2s, self.policies)): snapshot["qf1_{}".format(i)] = qf1 snapshot["qf2_{}".format(i)] = qf2 snapshot["pqf1_{}".format(i)] = pqf1 snapshot["pqf2_{}".format(i)] = pqf2 snapshot["target_qf1_{}".format(i)] = target_qf1 snapshot["target_qf2_{}".format(i)] = target_qf2 snapshot["policy_{}".format(i)] = policy return snapshot
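# ----------------------------------------------------------------------------
# Illustrative sketch with assumed shapes (not the original training code): how
# the per-head bootstrap mask appended to each stored observation is used. Each
# transition gets a Bernoulli(1 - droprate) mask over heads, and head h only
# learns from transitions whose mask column h is 1, which is what multiplying
# the squared TD error by the mask in the QF losses above achieves.
import numpy as np
import torch


def bootstrap_masks(batch_size, num_heads, droprate=0.5):
    # 1 means "this head trains on this transition".
    return (np.random.uniform(size=(batch_size, num_heads)) > droprate).astype(np.float32)


def masked_td_loss(q_pred, q_target, mask):
    # q_pred, q_target: (batch, 1); mask: (batch,) for a single head.
    per_sample = (q_pred - q_target.detach()) ** 2
    return (per_sample.squeeze(-1) * mask).sum() / mask.sum().clamp(min=1.0)


if __name__ == '__main__':
    batch, num_heads, head = 8, 10, 3
    q_pred = torch.randn(batch, 1, requires_grad=True)
    q_target = torch.randn(batch, 1)
    masks = torch.tensor(bootstrap_masks(batch, num_heads))
    loss = masked_td_loss(q_pred, q_target, masks[:, head])
    loss.backward()
    print(float(loss))
# ----------------------------------------------------------------------------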
class BaseAlgorithm(metaclass=abc.ABCMeta): """ base algorithm for single task setting can be used for RL or Learning from Demonstrations """ def __init__( self, env, exploration_policy: ExplorationPolicy, training_env=None, eval_policy=None, eval_sampler=None, num_epochs=100, num_steps_per_epoch=10000, num_steps_between_train_calls=1000, num_steps_per_eval=1000, max_path_length=1000, min_steps_before_training=0, replay_buffer=None, replay_buffer_size=10000, freq_saving=1, save_replay_buffer=False, save_environment=False, save_algorithm=False, save_best=False, save_best_starting_from_epoch=0, best_key='AverageReturn', # higher is better no_terminal=False, wrap_absorbing=False, render=False, render_kwargs={}, freq_log_visuals=1, eval_deterministic=False): self.env = env self.training_env = training_env or pickle.loads(pickle.dumps(env)) self.exploration_policy = exploration_policy self.num_epochs = num_epochs self.num_env_steps_per_epoch = num_steps_per_epoch self.num_steps_between_train_calls = num_steps_between_train_calls self.num_steps_per_eval = num_steps_per_eval self.max_path_length = max_path_length self.min_steps_before_training = min_steps_before_training self.render = render self.save_replay_buffer = save_replay_buffer self.save_algorithm = save_algorithm self.save_environment = save_environment self.save_best = save_best self.save_best_starting_from_epoch = save_best_starting_from_epoch self.best_key = best_key self.best_statistic_so_far = float('-Inf') if eval_sampler is None: if eval_policy is None: eval_policy = exploration_policy eval_policy = MakeDeterministic(eval_policy) eval_sampler = PathSampler(env, eval_policy, num_steps_per_eval, max_path_length, no_terminal=no_terminal, render=render, render_kwargs=render_kwargs) self.eval_policy = eval_policy self.eval_sampler = eval_sampler self.action_space = env.action_space self.obs_space = env.observation_space self.replay_buffer_size = replay_buffer_size if replay_buffer is None: assert max_path_length < replay_buffer_size replay_buffer = EnvReplayBuffer( self.replay_buffer_size, self.env, random_seed=np.random.randint(10000)) else: assert max_path_length < replay_buffer._max_replay_buffer_size self.replay_buffer = replay_buffer self._n_env_steps_total = 0 self._n_train_steps_total = 0 self._n_rollouts_total = 0 self._do_train_time = 0 self._epoch_start_time = None self._algo_start_time = None self._old_table_keys = None self._current_path_builder = PathBuilder() self._exploration_paths = [] if wrap_absorbing: # needs to be properly handled both here and in replay buffer raise NotImplementedError() self.wrap_absorbing = wrap_absorbing self.freq_saving = freq_saving self.no_terminal = no_terminal self.eval_statistics = None self.freq_log_visuals = freq_log_visuals def train(self, start_epoch=0): self.pretrain() if start_epoch == 0: params = self.get_epoch_snapshot(-1) logger.save_itr_params(-1, params) self.training_mode(False) self._n_env_steps_total = start_epoch * self.num_env_steps_per_epoch gt.reset() gt.set_def_unique(False) self.start_training(start_epoch=start_epoch) def pretrain(self): """ Do anything before the main training phase. 
""" pass def start_training(self, start_epoch=0): self._current_path_builder = PathBuilder() observation = self._start_new_rollout() for epoch in gt.timed_for( range(start_epoch, self.num_epochs), save_itrs=True, ): self._start_epoch(epoch) for steps_this_epoch in range(self.num_env_steps_per_epoch): action, agent_info = self._get_action_and_info(observation) if self.render: self.training_env.render() next_ob, raw_reward, terminal, env_info = ( self.training_env.step(action)) if self.no_terminal: terminal = False self._n_env_steps_total += 1 reward = np.array([raw_reward]) terminal = np.array([terminal]) self._handle_step( observation, action, reward, next_ob, np.array([False]) if self.no_terminal else terminal, absorbing=np.array([0., 0.]), agent_info=agent_info, env_info=env_info, ) if terminal[0]: if self.wrap_absorbing: raise NotImplementedError() ''' If we wrap absorbing states, two additional transitions must be added: (s_T, s_abs) and (s_abs, s_abs). In Disc Actor Critic paper they make s_abs be a vector of 0s with last dim set to 1. Here we are going to add the following: ([next_ob,0], random_action, [next_ob, 1]) and ([next_ob,1], random_action, [next_ob, 1]) This way we can handle varying types of terminal states. ''' # next_ob is the absorbing state # for now just taking the previous action self._handle_step( next_ob, action, # env.action_space.sample(), # the reward doesn't matter reward, next_ob, np.array([False]), absorbing=np.array([0.0, 1.0]), agent_info=agent_info, env_info=env_info) self._handle_step( next_ob, action, # env.action_space.sample(), # the reward doesn't matter reward, next_ob, np.array([False]), absorbing=np.array([1.0, 1.0]), agent_info=agent_info, env_info=env_info) self._handle_rollout_ending() observation = self._start_new_rollout() elif len(self._current_path_builder) >= self.max_path_length: self._handle_rollout_ending() observation = self._start_new_rollout() else: observation = next_ob if self._n_env_steps_total % self.num_steps_between_train_calls == 0: gt.stamp('sample') self._try_to_train(epoch) gt.stamp('train') gt.stamp('sample') self._try_to_eval(epoch) gt.stamp('eval') self._end_epoch() def _try_to_train(self, epoch): if self._can_train(): self.training_mode(True) self._do_training(epoch) self._n_train_steps_total += 1 self.training_mode(False) def _try_to_eval(self, epoch): if self._can_evaluate(): # save if it's time to save if (epoch % self.freq_saving == 0) or (epoch + 1 >= self.num_epochs): # if epoch + 1 >= self.num_epochs: # epoch = 'final' logger.save_extra_data(self.get_extra_data_to_save(epoch)) params = self.get_epoch_snapshot(epoch) logger.save_itr_params(epoch, params) self.evaluate(epoch) logger.record_tabular( "Number of train calls total", self._n_train_steps_total, ) logger.record_tabular( "Number of env steps total", self._n_env_steps_total, ) logger.record_tabular( "Number of rollouts total", self._n_rollouts_total, ) times_itrs = gt.get_times().stamps.itrs train_time = times_itrs['train'][-1] sample_time = times_itrs['sample'][-1] eval_time = times_itrs['eval'][-1] if epoch > 0 else 0 epoch_time = train_time + sample_time + eval_time total_time = gt.get_times().total logger.record_tabular('Train Time (s)', train_time) logger.record_tabular('(Previous) Eval Time (s)', eval_time) logger.record_tabular('Sample Time (s)', sample_time) logger.record_tabular('Epoch Time (s)', epoch_time) logger.record_tabular('Total Train Time (s)', total_time) logger.record_tabular("Epoch", epoch) logger.dump_tabular(with_prefix=False, 
with_timestamp=False) else: logger.log("Skipping eval for now.") def _can_evaluate(self): """ One annoying thing about the logger table is that the keys at each iteration need to be the exact same. So unless you can compute everything, skip evaluation. A common example for why you might want to skip evaluation is that at the beginning of training, you may not have enough data for a validation and training set. :return: """ return (len(self._exploration_paths) > 0 and self.replay_buffer.num_steps_can_sample() >= self.min_steps_before_training) def _can_train(self): return self.replay_buffer.num_steps_can_sample( ) >= self.min_steps_before_training def _get_action_and_info(self, observation): """ Get an action to take in the environment. :param observation: :return: """ self.exploration_policy.set_num_steps_total(self._n_env_steps_total) return self.exploration_policy.get_action(observation, ) def _start_epoch(self, epoch): self._epoch_start_time = time.time() self._exploration_paths = [] self._do_train_time = 0 logger.push_prefix('Iteration #%d | ' % epoch) def _end_epoch(self): self.eval_statistics = None logger.log("Epoch Duration: {0}".format(time.time() - self._epoch_start_time)) logger.log("Started Training: {0}".format(self._can_train())) logger.pop_prefix() def _start_new_rollout(self): self.exploration_policy.reset() return self.training_env.reset() def _handle_path(self, path): """ Naive implementation: just loop through each transition. :param path: :return: """ for (ob, action, reward, next_ob, terminal, agent_info, env_info) in zip( path["observations"], path["actions"], path["rewards"], path["next_observations"], path["terminals"], path["agent_infos"], path["env_infos"], ): self._handle_step( ob, action, reward, next_ob, terminal, agent_info=agent_info, env_info=env_info, ) self._handle_rollout_ending() def _handle_step( self, observation, action, reward, next_observation, terminal, absorbing, agent_info, env_info, ): """ Implement anything that needs to happen after every step :return: """ self._current_path_builder.add_all( observations=observation, actions=action, rewards=reward, next_observations=next_observation, terminals=terminal, absorbing=absorbing, agent_infos=agent_info, env_infos=env_info, ) self.replay_buffer.add_sample( observation=observation, action=action, reward=reward, terminal=terminal, next_observation=next_observation, absorbing=absorbing, agent_info=agent_info, env_info=env_info, ) def _handle_rollout_ending(self): """ Implement anything that needs to happen after every rollout. """ self.replay_buffer.terminate_episode() self._n_rollouts_total += 1 if len(self._current_path_builder) > 0: self._exploration_paths.append(self._current_path_builder) self._current_path_builder = PathBuilder() def get_epoch_snapshot(self, epoch): """ Probably will be overridden by each algorithm """ data_to_save = dict( epoch=epoch, exploration_policy=self.exploration_policy, ) if self.save_environment: data_to_save['env'] = self.training_env return data_to_save # @abc.abstractmethod # def load_snapshot(self, snapshot): # """ # Should be implemented on a per algorithm basis # taking into consideration the particular # get_epoch_snapshot implementation for the algorithm # """ # pass def get_extra_data_to_save(self, epoch): """ Save things that shouldn't be saved every snapshot but rather overwritten every time. 
:param epoch: :return: """ if self.render: self.training_env.render(close=True) data_to_save = dict(epoch=epoch, ) if self.save_environment: data_to_save['env'] = self.training_env if self.save_replay_buffer: data_to_save['replay_buffer'] = self.replay_buffer if self.save_algorithm: data_to_save['algorithm'] = self return data_to_save @abc.abstractmethod def training_mode(self, mode): """ Set training mode to `mode`. :param mode: If True, training will happen (e.g. set the dropout probabilities to not all ones). """ pass @abc.abstractmethod def _do_training(self): """ Perform some update, e.g. perform one gradient step. :return: """ pass def evaluate(self, epoch): """ Evaluate the policy, e.g. save/print progress. :param epoch: :return: """ statistics = OrderedDict() try: statistics.update(self.eval_statistics) self.eval_statistics = None except: print('No Stats to Eval') logger.log("Collecting samples for evaluation") test_paths = self.eval_sampler.obtain_samples() statistics.update( eval_util.get_generic_path_information( test_paths, stat_prefix="Test", )) statistics.update( eval_util.get_generic_path_information( self._exploration_paths, stat_prefix="Exploration", )) if hasattr(self.env, "log_diagnostics"): self.env.log_diagnostics(test_paths) if hasattr(self.env, "log_statistics"): statistics.update(self.env.log_statistics(test_paths)) if epoch % self.freq_log_visuals == 0: if hasattr(self.env, "log_visuals"): self.env.log_visuals(test_paths, epoch, logger.get_snapshot_dir()) average_returns = eval_util.get_average_returns(test_paths) statistics['AverageReturn'] = average_returns for key, value in statistics.items(): logger.record_tabular(key, value) best_statistic = statistics[self.best_key] if best_statistic > self.best_statistic_so_far: self.best_statistic_so_far = best_statistic if self.save_best and epoch >= self.save_best_starting_from_epoch: data_to_save = {'epoch': epoch, 'statistics': statistics} data_to_save.update(self.get_epoch_snapshot(epoch)) logger.save_extra_data(data_to_save, 'best.pkl') print('\n\nSAVED BEST\n\n')
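# ----------------------------------------------------------------------------
# Minimal standalone sketch: the save-best bookkeeping at the end of
# BaseAlgorithm.evaluate above. `save_snapshot` is a hypothetical stand-in for
# logger.save_extra_data(data, 'best.pkl'); only the comparison logic is shown.
class BestCheckpointTracker:
    def __init__(self, best_key='AverageReturn', save_best=True,
                 save_best_starting_from_epoch=0):
        self.best_key = best_key
        self.save_best = save_best
        self.save_best_starting_from_epoch = save_best_starting_from_epoch
        self.best_statistic_so_far = float('-inf')

    def update(self, epoch, statistics, save_snapshot):
        current = statistics[self.best_key]
        if current > self.best_statistic_so_far:
            self.best_statistic_so_far = current
            if self.save_best and epoch >= self.save_best_starting_from_epoch:
                save_snapshot({'epoch': epoch, 'statistics': statistics})
                return True
        return False


if __name__ == '__main__':
    tracker = BestCheckpointTracker()
    saved = tracker.update(0, {'AverageReturn': 1.5}, save_snapshot=print)
    print('saved:', saved)
# ----------------------------------------------------------------------------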
class MetaIRLAlgorithm(metaclass=abc.ABCMeta): ''' While True: generate trajectories for a batch of different task settings update the models ''' def __init__( self, env, train_context_expert_replay_buffer, train_test_expert_replay_buffer, test_context_expert_replay_buffer, test_test_expert_replay_buffer, train_task_params_sampler, test_task_params_sampler, training_env=None, num_epochs=100, num_rollouts_per_epoch=10, num_rollouts_between_updates=10, num_initial_rollouts_for_all_train_tasks=0, min_rollouts_before_training=10, max_path_length=1000, discount=0.99, replay_buffer_size_per_task=20000, render=False, save_replay_buffer=False, save_algorithm=False, save_environment=False, replay_buffer=None, policy_uses_pixels=False, wrap_absorbing=False, freq_saving=1, do_not_train=False, do_not_eval=False, # some environment like halfcheetah_v2 have a timelimit that defines the terminal # this is used as a minor hack to turn off time limits no_terminal=False, save_best=False, save_best_after_epoch=0, custom_save_epoch=[], use_env_getter=False, training_env_getter=None, test_env_getter=None, get_full_obs_dict=False, **kwargs): self.use_env_getter = use_env_getter self.training_env_getter = training_env_getter self.test_env_getter = test_env_getter self.get_full_obs_dict = get_full_obs_dict if self.use_env_getter: cur_task_params, cur_obs_task_params = train_task_params_sampler.sample( ) self.training_env = self.training_env_getter(cur_obs_task_params) else: self.training_env = training_env or pickle.loads(pickle.dumps(env)) # self.training_env = training_env or deepcopy(env) self.train_context_expert_replay_buffer = train_context_expert_replay_buffer self.train_test_expert_replay_buffer = train_test_expert_replay_buffer self.test_context_expert_replay_buffer = test_context_expert_replay_buffer self.test_test_expert_replay_buffer = test_test_expert_replay_buffer self.num_epochs = num_epochs self.num_rollouts_per_epoch = num_rollouts_per_epoch self.num_rollouts_between_updates = num_rollouts_between_updates self.num_initial_rollouts_for_all_train_tasks = num_initial_rollouts_for_all_train_tasks self.min_rollouts_before_training = min_rollouts_before_training self.max_path_length = max_path_length self.discount = discount self.replay_buffer_size_per_task = replay_buffer_size_per_task self.render = render self.save_replay_buffer = save_replay_buffer self.save_algorithm = save_algorithm self.save_environment = save_environment self.policy_uses_pixels = policy_uses_pixels if self.use_env_getter: cur_task_params, cur_obs_task_params = test_task_params_sampler.sample( ) self.env = test_env_getter(cur_obs_task_params) else: self.env = env self.action_space = self.env.action_space self.obs_space = self.env.observation_space if replay_buffer is None: replay_buffer = MetaEnvReplayBuffer( self.replay_buffer_size_per_task, self.training_env, policy_uses_pixels=self.policy_uses_pixels, ) self.replay_buffer = replay_buffer self._n_env_steps_total = 0 self._n_train_steps_total = 0 self._n_rollouts_total = 0 self._do_train_time = 0 self._epoch_start_time = None self._algo_start_time = None self._old_table_keys = None self._current_path_builder = PathBuilder() self._exploration_paths = [] self.wrap_absorbing = wrap_absorbing if self.wrap_absorbing: assert isinstance(env, WrappedAbsorbingEnv), 'Env is not wrapped!' 
self.freq_saving = freq_saving self.no_terminal = no_terminal if self.no_terminal: print('\n\nDOING NO TERMINAL!\n\n') self.train_task_params_sampler = train_task_params_sampler self.test_task_params_sampler = test_task_params_sampler self.do_not_train = do_not_train self.do_not_eval = do_not_eval self.best_meta_test = np.float('-inf') self.save_best = save_best self.save_best_after_epoch = save_best_after_epoch self.custom_save_epoch = custom_save_epoch def train(self, start_epoch=0): self.pretrain() if start_epoch == 0: params = self.get_epoch_snapshot(-1) logger.save_itr_params(-1, params) self.training_mode(False) # self._n_env_steps_total = start_epoch * self.num_env_steps_per_epoch gt.reset() gt.set_def_unique(False) self.train_online(start_epoch=start_epoch) def pretrain(self): """ Do anything before the main training phase. """ if self.num_initial_rollouts_for_all_train_tasks > 0: self.generate_rollouts_for_all_train_tasks( self.num_initial_rollouts_for_all_train_tasks) print('\nGenerated Initial Task Rollouts\n') gt.stamp('initial_task_rollouts') def generate_rollouts_for_all_train_tasks(self, num_rollouts_per_task): ''' This is a simple work-around for a problem that arises when sampling batches for NP-AIRL because you need to be able to sample a minimum number of trajectories per train task. I will try to replace this with a better fix later. ''' i = 0 for task_params, obs_task_params in self.train_task_params_sampler: print('rollouts for task %d' % i) # print('new task rollout') for _ in range(num_rollouts_per_task): self.generate_exploration_rollout( task_params=task_params, obs_task_params=obs_task_params) i += 1 # exploration paths maintains the exploration paths in one epoch # so that we can analyze certain properties of the trajs if we # wanted. we don't want these trajs to count towards that really. self._exploration_paths = [] def generate_exploration_rollout(self, task_params=None, obs_task_params=None): observation, task_identifier = self._start_new_rollout( task_params=task_params, obs_task_params=obs_task_params) # _current_path_builder is initialized to a new one everytime # you call handle rollout ending # When you start a new rollout, self.exploration_policy # is set to the one for the current task terminal = False while (not terminal) and len( self._current_path_builder) < self.max_path_length: if isinstance(self.obs_space, Dict): if self.get_full_obs_dict: agent_obs = observation else: if self.policy_uses_pixels: agent_obs = observation['pixels'] else: agent_obs = observation['obs'] else: agent_obs = observation action, agent_info = self._get_action_and_info(agent_obs) if self.render: self.training_env.render() next_ob, raw_reward, terminal, env_info = ( self.training_env.step(action)) if self.no_terminal: terminal = False self._n_env_steps_total += 1 reward = raw_reward terminal = np.array([terminal]) reward = np.array([reward]) self._handle_step( observation, action, reward, next_ob, np.array([False]) if self.wrap_absorbing else terminal, task_identifier, agent_info=agent_info, env_info=env_info, ) observation = next_ob if terminal and self.wrap_absorbing: raise NotImplementedError("I think they used 0 actions for this") # next_ob is the absorbing state # for now just using the action from the previous timesteps # as well as agent info and env info self._handle_step( next_ob, action, # the reward doesn't matter cause it will be # overwritten by the model that defines the reward # e.g. 
the discriminator in GAIL reward, next_ob, terminal, task_identifier, agent_info=agent_info, env_info=env_info) self._handle_rollout_ending(task_identifier) def train_online(self, start_epoch=0): # No need for training mode to be True when generating trajectories # training mode is automatically set to True # in _try_to_train and before exiting # it that function it reverts it to False self.training_mode(False) self._current_path_builder = PathBuilder() self._n_rollouts_total = 0 for epoch in gt.timed_for( range(start_epoch, self.num_epochs), save_itrs=True, ): self._start_epoch(epoch) print('EPOCH STARTED') # print('epoch') for _ in range(self.num_rollouts_per_epoch): # print('rollout') task_params, obs_task_params = self.train_task_params_sampler.sample( ) self.generate_exploration_rollout( task_params=task_params, obs_task_params=obs_task_params) # print(self._n_rollouts_total) if self._n_rollouts_total % self.num_rollouts_between_updates == 0: gt.stamp('sample') # print('train') if not self.do_not_train: self._try_to_train(epoch) gt.stamp('train') if not self.do_not_eval: self._try_to_eval(epoch) gt.stamp('eval') self._end_epoch() def _try_to_train(self, epoch): if self._can_train(): self.training_mode(True) self._do_training(epoch) self._n_train_steps_total += 1 self.training_mode(False) def _try_to_eval(self, epoch): if epoch % self.freq_saving == 0: logger.save_extra_data(self.get_extra_data_to_save(epoch)) if self._can_evaluate(): self.evaluate(epoch) if epoch % self.freq_saving == 0: params = self.get_epoch_snapshot(epoch) logger.save_itr_params(epoch, params) table_keys = logger.get_table_key_set() # logger.record_tabular( # "Number of train steps total", # self._n_policy_train_steps_total, # ) logger.record_tabular( "Number of env steps total", self._n_env_steps_total, ) logger.record_tabular( "Number of rollouts total", self._n_rollouts_total, ) times_itrs = gt.get_times().stamps.itrs train_time = times_itrs['train'][-1] sample_time = times_itrs['sample'][-1] eval_time = times_itrs['eval'][-1] if epoch > 0 else 0 epoch_time = train_time + sample_time + eval_time total_time = gt.get_times().total logger.record_tabular('Train Time (s)', train_time) logger.record_tabular('(Previous) Eval Time (s)', eval_time) logger.record_tabular('Sample Time (s)', sample_time) logger.record_tabular('Epoch Time (s)', epoch_time) logger.record_tabular('Total Train Time (s)', total_time) logger.record_tabular("Epoch", epoch) logger.dump_tabular(with_prefix=False, with_timestamp=False) else: logger.log("Skipping eval for now.") def _can_evaluate(self): """ One annoying thing about the logger table is that the keys at each iteration need to be the exact same. So unless you can compute everything, skip evaluation. A common example for why you might want to skip evaluation is that at the beginning of training, you may not have enough data for a validation and training set. :return: """ return (len(self._exploration_paths) > 0 and self._n_rollouts_total >= self.min_rollouts_before_training) def _can_train(self): return self._n_rollouts_total >= self.min_rollouts_before_training def _get_action_and_info(self, observation): """ Get an action to take in the environment. 
:param observation: :return: """ self.exploration_policy.set_num_steps_total(self._n_env_steps_total) return self.exploration_policy.get_action(observation, ) def _start_epoch(self, epoch): self._epoch_start_time = time.time() self._exploration_paths = [] self._do_train_time = 0 logger.push_prefix('Iteration #%d | ' % epoch) def _end_epoch(self): logger.log("Epoch Duration: {0}".format(time.time() - self._epoch_start_time)) logger.log("Started Training: {0}".format(self._can_train())) logger.pop_prefix() def _start_new_rollout(self, task_params=None, obs_task_params=None): if self.use_env_getter: self.training_env = self.training_env_getter(obs_task_params) obs_from_reset = self.training_env.reset() observation = self.training_env._get_obs() else: if task_params is None: task_params, obs_task_params = self.train_task_params_sampler.sample( ) observation = self.training_env.reset( task_params=task_params, obs_task_params=obs_task_params) task_id = self.training_env.task_identifier self.exploration_policy = self.get_exploration_policy(task_id) self.exploration_policy.reset() return observation, task_id def _handle_path(self, path, task_identifier): """ Naive implementation: just loop through each transition. :param path: :return: """ for (ob, action, reward, next_ob, terminal, agent_info, env_info) in zip( path["observations"], path["actions"], path["rewards"], path["next_observations"], path["terminals"], path["agent_infos"], path["env_infos"], ): self._handle_step( ob, action, reward, next_ob, terminal, task_identifier, agent_info=agent_info, env_info=env_info, ) self._handle_rollout_ending(task_identifier) def _handle_step( self, observation, action, reward, next_observation, terminal, task_identifier, agent_info, env_info, ): """ Implement anything that needs to happen after every step :return: """ self._current_path_builder.add_all(observations=observation, actions=action, rewards=reward, next_observations=next_observation, terminals=terminal, agent_infos=agent_info, env_infos=env_info, task_identifiers=task_identifier) self.replay_buffer.add_sample( observation=observation, action=action, reward=reward, terminal=terminal, next_observation=next_observation, task_identifier=task_identifier, agent_info=agent_info, env_info=env_info, ) def _handle_rollout_ending(self, task_identifier): """ Implement anything that needs to happen after every rollout. """ self.replay_buffer.terminate_episode(task_identifier) self._n_rollouts_total += 1 if len(self._current_path_builder) > 0: self._exploration_paths.append( self._current_path_builder.get_all_stacked()) self._current_path_builder = PathBuilder() def get_epoch_snapshot(self, epoch): data_to_save = dict(epoch=epoch, ) if self.save_environment: data_to_save['env'] = self.training_env return data_to_save def get_extra_data_to_save(self, epoch): """ Save things that shouldn't be saved every snapshot but rather overwritten every time. :param epoch: :return: """ if self.render: self.training_env.render(close=True) data_to_save = dict(epoch=epoch, ) if self.save_environment: data_to_save['env'] = self.training_env if self.save_replay_buffer: data_to_save['replay_buffer'] = self.replay_buffer if self.save_algorithm: data_to_save['algorithm'] = self return data_to_save @abc.abstractmethod def get_exploration_policy(self, task_identifier): ''' Since for each task a meta-irl algorithm needs to somehow use some expert demonstrations, this is a convenience method to get a version of the policy that is handling this stuff internally. 
Example: In the neural process meta-irl method, for a given task we need to
        take some demonstrations, infer the posterior, sample from the posterior,
        then condition the policy by concatenating the sample to any observations
        that are passed to the policy. So internally, in np_bc and np_airl, when we
        call get_exploration_policy we set the latent sample for a
        PostCondReparamTanhMultivariateGaussianPolicy and return that. From then on,
        whenever we call get_action on the policy, it internally concatenates the
        latent to the observation passed to it.
        '''
        pass

    @abc.abstractmethod
    def get_eval_policy(self, task_identifier):
        '''
        Since for each task a meta-irl algorithm needs to somehow use some
        expert demonstrations, this is a convenience method to get a version
        of the policy that is handling this stuff internally.

        Example: In the neural process meta-irl method, for a given task we need to
        take some demonstrations, infer the posterior, sample from the posterior,
        then condition the policy by concatenating the sample to any observations
        that are passed to the policy. So internally, in np_bc and np_airl, when we
        call get_exploration_policy we set the latent sample for a
        PostCondReparamTanhMultivariateGaussianPolicy and return that. From then on,
        whenever we call get_action on the policy, it internally concatenates the
        latent to the observation passed to it.
        '''
        pass

    @abc.abstractmethod
    def obtain_eval_samples(self, epoch):
        pass

    @abc.abstractmethod
    def training_mode(self, mode):
        """
        Set training mode to `mode`.
        :param mode: If True, training will happen (e.g. set the dropout
        probabilities to not all ones).
        """
        pass

    @abc.abstractmethod
    def cuda(self):
        """
        Turn cuda on.
        :return:
        """
        pass

    @abc.abstractmethod
    def cpu(self):
        """
        Turn cuda off.
        :return:
        """
        pass

    @abc.abstractmethod
    def evaluate(self, epoch):
        """
        Evaluate the policy, e.g. save/print progress.
        :param epoch:
        :return:
        """
        pass

    @abc.abstractmethod
    def _do_training(self):
        """
        Perform some update, e.g. perform one gradient step.
        :return:
        """
        pass
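# ----------------------------------------------------------------------------
# Simplified sketch (hypothetical wrapper, not the original
# PostCondReparamTanhMultivariateGaussianPolicy): what conditioning the
# exploration or eval policy on a latent sample amounts to. A fixed latent z is
# concatenated to every observation before the base policy is queried, matching
# the [latent_repr, obs] ordering used elsewhere in this file.
import numpy as np


class LatentConditionedPolicy:
    def __init__(self, base_policy, latent):
        self.base_policy = base_policy
        self.latent = np.asarray(latent)

    def set_latent(self, latent):
        # Called once per task, e.g. after sampling from the inferred posterior.
        self.latent = np.asarray(latent)

    def get_action(self, observation):
        conditioned_obs = np.concatenate([self.latent, observation])
        return self.base_policy.get_action(conditioned_obs)

    def reset(self):
        if hasattr(self.base_policy, 'reset'):
            self.base_policy.reset()


if __name__ == '__main__':
    class RandomPolicy:
        def get_action(self, obs):
            return np.zeros(2), {}  # (action, agent_info)

    policy = LatentConditionedPolicy(RandomPolicy(), latent=np.ones(4))
    action, agent_info = policy.get_action(np.zeros(6))
    print(action, agent_info)
# ----------------------------------------------------------------------------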
class RLAlgorithm(metaclass=abc.ABCMeta): def __init__( self, env, exploration_policy: ExplorationPolicy, training_env=None, num_epochs=100, num_steps_per_epoch=10000, num_steps_per_eval=1000, num_updates_per_env_step=1, batch_size=1024, max_path_length=1000, discount=0.99, replay_buffer_size=1000000, reward_scale=1, render=False, save_replay_buffer=False, save_algorithm=False, save_environment=True, eval_sampler=None, eval_policy=None, replay_buffer=None, ): """ Base class for RL Algorithms :param env: Environment used to evaluate. :param exploration_policy: Policy used to explore :param training_env: Environment used by the algorithm. By default, a copy of `env` will be made. :param num_epochs: :param num_steps_per_epoch: :param num_steps_per_eval: :param num_updates_per_env_step: Used by online training mode. :param num_updates_per_epoch: Used by batch training mode. :param batch_size: :param max_path_length: :param discount: :param replay_buffer_size: :param reward_scale: :param render: :param save_replay_buffer: :param save_algorithm: :param save_environment: :param eval_sampler: :param eval_policy: Policy to evaluate with. :param replay_buffer: """ self.training_env = training_env or pickle.loads(pickle.dumps(env)) self.exploration_policy = exploration_policy self.num_epochs = num_epochs self.num_env_steps_per_epoch = num_steps_per_epoch self.num_steps_per_eval = num_steps_per_eval self.num_updates_per_train_call = num_updates_per_env_step self.batch_size = batch_size self.max_path_length = max_path_length self.discount = discount self.replay_buffer_size = replay_buffer_size self.reward_scale = reward_scale self.render = render self.save_replay_buffer = save_replay_buffer self.save_algorithm = save_algorithm self.save_environment = save_environment self.num_skills = 5 # added the num skills right here!! self.pz = np.full(self.num_skills, 1. / self.num_skills) #self.curr_z = self.sample_z() if eval_sampler is None: if eval_policy is None: eval_policy = exploration_policy eval_sampler = InPlacePathSampler( env=env, policy=eval_policy, max_samples=self.num_steps_per_eval + self.max_path_length, max_path_length=self.max_path_length, ) self.eval_policy = eval_policy self.eval_sampler = eval_sampler self.action_space = env.action_space self.obs_space = env.observation_space self.env = env if replay_buffer is None: replay_buffer = EnvReplayBuffer( self.replay_buffer_size, self.env, ) self.replay_buffer = replay_buffer self._n_env_steps_total = 0 self._n_train_steps_total = 0 self._n_rollouts_total = 0 self._do_train_time = 0 self._epoch_start_time = None self._algo_start_time = None self._old_table_keys = None self._current_path_builder = PathBuilder() self._exploration_paths = [] def train(self, start_epoch=0): self.pretrain() if start_epoch == 0: params = self.get_epoch_snapshot(-1) logger.save_itr_params(-1, params) self.training_mode(False) self._n_env_steps_total = start_epoch * self.num_env_steps_per_epoch gt.reset() gt.set_def_unique(False) self.train_online(start_epoch=start_epoch) def pretrain(self): """ Do anything before the main training phase. 
""" pass ''' TODO: Write a function for sample z here''' def sample_z(self): ''' sample z''' dummy = np.zeros((self.num_skills)) dummy[np.random.choice(self.num_skills, p=self.pz)] = 1 # pdb.set_trace() return dummy ''' TODO: concat funciton''' def concat_state_z(self, state, z): return np.concatenate([state, z], axis=0) def train_online(self, start_epoch=0): self._current_path_builder = PathBuilder() observation = self._start_new_rollout() #observation = self.concat_state_z(state, self.curr_z) for epoch in gt.timed_for( range(start_epoch, self.num_epochs), save_itrs=True, ): self._start_epoch(epoch) for _ in range(self.num_env_steps_per_epoch): ''' TODO''' ''' append the latent variable here''' action, agent_info = self._get_action_and_info(observation, ) if self.render: self.training_env.render() next_state, raw_reward, terminal, env_info = ( self.training_env.step(action)) # print (terminal) next_ob = self.concat_state_z(next_state, self.curr_z) self._n_env_steps_total += 1 reward = raw_reward * self.reward_scale terminal = np.array([terminal]) reward = np.array([reward]) self._handle_step( observation, action, reward, next_ob, terminal, agent_info=agent_info, env_info=env_info, ) if terminal or len( self._current_path_builder) >= self.max_path_length: self._handle_rollout_ending() observation = self._start_new_rollout() #print ('starting new rollout') else: observation = next_ob gt.stamp('sample') self._try_to_train() gt.stamp('train') # need to fix the evaluation here..figure this out!! # self._try_to_eval(epoch) # gt.stamp('eval') # self._end_epoch() def _try_to_train(self): if self._can_train(): self.training_mode(True) for i in range(self.num_updates_per_train_call): self._do_training() self._n_train_steps_total += 1 self.training_mode(False) def _try_to_eval(self, epoch): logger.save_extra_data(self.get_extra_data_to_save(epoch)) if self._can_evaluate(): self.evaluate(epoch) params = self.get_epoch_snapshot(epoch) logger.save_itr_params(epoch, params) table_keys = logger.get_table_key_set() if self._old_table_keys is not None: assert table_keys == self._old_table_keys, ( "Table keys cannot change from iteration to iteration.") self._old_table_keys = table_keys logger.record_tabular( "Number of train steps total", self._n_train_steps_total, ) logger.record_tabular( "Number of env steps total", self._n_env_steps_total, ) logger.record_tabular( "Number of rollouts total", self._n_rollouts_total, ) times_itrs = gt.get_times().stamps.itrs train_time = times_itrs['train'][-1] sample_time = times_itrs['sample'][-1] eval_time = times_itrs['eval'][-1] if epoch > 0 else 0 epoch_time = train_time + sample_time + eval_time total_time = gt.get_times().total logger.record_tabular('Train Time (s)', train_time) logger.record_tabular('(Previous) Eval Time (s)', eval_time) logger.record_tabular('Sample Time (s)', sample_time) logger.record_tabular('Epoch Time (s)', epoch_time) logger.record_tabular('Total Train Time (s)', total_time) logger.record_tabular("Epoch", epoch) logger.dump_tabular(with_prefix=False, with_timestamp=False) else: logger.log("Skipping eval for now.") def _can_evaluate(self): """ One annoying thing about the logger table is that the keys at each iteration need to be the exact same. So unless you can compute everything, skip evaluation. A common example for why you might want to skip evaluation is that at the beginning of training, you may not have enough data for a validation and training set. 
:return: """ return (len(self._exploration_paths) > 0 and self.replay_buffer.num_steps_can_sample() >= self.batch_size) def _can_train(self): return self.replay_buffer.num_steps_can_sample() >= self.batch_size def _get_action_and_info(self, observation): """ Get an action to take in the environment. :param observation: :return: """ #print (observation.shape) self.exploration_policy.set_num_steps_total(self._n_env_steps_total) return self.exploration_policy.get_action(observation, ) def _start_epoch(self, epoch): self._epoch_start_time = time.time() self._exploration_paths = [] self._do_train_time = 0 logger.push_prefix('Iteration #%d | ' % epoch) def _end_epoch(self): logger.log("Epoch Duration: {0}".format(time.time() - self._epoch_start_time)) logger.log("Started Training: {0}".format(self._can_train())) logger.pop_prefix() def _start_new_rollout(self): self.curr_z = self.sample_z() self.exploration_policy.reset() return self.concat_state_z(self.training_env.reset(), self.curr_z) def _handle_path(self, path): """ Naive implementation: just loop through each transition. :param path: :return: """ for (ob, action, reward, next_ob, terminal, agent_info, env_info) in zip( path["observations"], path["actions"], path["rewards"], path["next_observations"], path["terminals"], path["agent_infos"], path["env_infos"], ): self._handle_step( ob, action, reward, next_ob, terminal, agent_info=agent_info, env_info=env_info, ) self._handle_rollout_ending() def _handle_step( self, observation, action, reward, next_observation, terminal, agent_info, env_info, ): """ Implement anything that needs to happen after every step :return: """ self._current_path_builder.add_all( observations=observation, actions=action, rewards=reward, next_observations=next_observation, terminals=terminal, agent_infos=agent_info, env_infos=env_info, ) self.replay_buffer.add_sample( observation=observation, action=action, reward=reward, terminal=terminal, next_observation=next_observation, agent_info=agent_info, env_info=env_info, ) def _handle_rollout_ending(self): """ Implement anything that needs to happen after every rollout. """ self.replay_buffer.terminate_episode() self._n_rollouts_total += 1 if len(self._current_path_builder) > 0: self._exploration_paths.append( self._current_path_builder.get_all_stacked()) self._current_path_builder = PathBuilder() def get_epoch_snapshot(self, epoch): if self.render: self.training_env.render(close=True) data_to_save = dict( epoch=epoch, exploration_policy=self.exploration_policy, ) if self.save_environment: data_to_save['env'] = self.training_env return data_to_save def get_extra_data_to_save(self, epoch): """ Save things that shouldn't be saved every snapshot but rather overwritten every time. :param epoch: :return: """ if self.render: self.training_env.render(close=True) data_to_save = dict(epoch=epoch, ) if self.save_environment: data_to_save['env'] = self.training_env if self.save_replay_buffer: data_to_save['replay_buffer'] = self.replay_buffer if self.save_algorithm: data_to_save['algorithm'] = self return data_to_save @abc.abstractmethod def training_mode(self, mode): """ Set training mode to `mode`. :param mode: If True, training will happen (e.g. set the dropout probabilities to not all ones). """ pass @abc.abstractmethod def cuda(self): """ Turn cuda on. :return: """ pass @abc.abstractmethod def evaluate(self, epoch): """ Evaluate the policy, e.g. save/print progress. :param epoch: :return: """ pass @abc.abstractmethod def _do_training(self): """ Perform some update, e.g. 
perform one gradient step. :return: """ pass
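The skill-conditioned variant above samples a one-hot z from a uniform prior and appends it to the state before querying the policy. A self-contained sketch of that conditioning step, using the same logic as sample_z and concat_state_z (the 3-dim placeholder state is only for illustration):

import numpy as np

num_skills = 5
p_z = np.full(num_skills, 1.0 / num_skills)  # uniform prior over skills

def sample_z():
    # one-hot skill vector with the active index drawn according to p_z
    z = np.zeros(num_skills)
    z[np.random.choice(num_skills, p=p_z)] = 1.0
    return z

def concat_state_z(state, z):
    # the policy observes [state, skill]
    return np.concatenate([state, z], axis=0)

z = sample_z()                          # sampled once per rollout, at reset
obs = concat_state_z(np.zeros(3), z)    # a 3-dim state becomes an 8-dim policy input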
class RLAlgorithm(metaclass=abc.ABCMeta): def __init__( self, env, exploration_policy: ExplorationPolicy, training_env=None, num_epochs=100, num_steps_per_epoch=10000, num_steps_per_eval=1000, num_updates_per_env_step=1, max_num_episodes=None, batch_size=1024, max_path_length=1000, discount=0.99, replay_buffer_size=1000000, reward_scale=1, render=False, save_replay_buffer=False, save_algorithm=False, save_environment=False, save_best=False, save_best_starting_from_epoch=0, eval_sampler=None, eval_policy=None, replay_buffer=None, # for compatibility with deepmind control suite # Right now the semantics are that if the observation is not a dictionary # then the policy just uses it directly. If it's a dictionary, it # checks whether policy_uses_pixels to see if it's true or false and # based on that it decides whether the policy takes 'pixels' or 'obs' # from the dictionary policy_uses_pixels=False, freq_saving=1, # for meta-learning policy_uses_task_params=False, # whether the policy uses the task parameters concat_task_params_to_policy_obs=False, # how the policy sees the task parameters # this is useful when you want to generate trajectories from the expert using the # exploration policy do_not_train=False, # some environments like halfcheetah_v2 have a time limit that defines the terminal; # this is used as a minor hack to turn off time limits no_terminal=False, **kwargs ): """ Base class for RL Algorithms :param env: Environment used to evaluate. :param exploration_policy: Policy used to explore :param training_env: Environment used by the algorithm. By default, a copy of `env` will be made. :param num_epochs: :param num_steps_per_epoch: :param num_steps_per_eval: :param num_updates_per_env_step: Used by online training mode. :param num_updates_per_epoch: Used by batch training mode. :param batch_size: :param max_path_length: :param discount: :param replay_buffer_size: :param reward_scale: :param render: :param save_replay_buffer: :param save_algorithm: :param save_environment: :param eval_sampler: :param eval_policy: Policy to evaluate with.
:param replay_buffer: """ self.training_env = training_env or pickle.loads(pickle.dumps(env)) # self.training_env = training_env or deepcopy(env) self.exploration_policy = exploration_policy self.num_epochs = num_epochs self.num_env_steps_per_epoch = num_steps_per_epoch self.num_steps_per_eval = num_steps_per_eval self.num_updates_per_train_call = num_updates_per_env_step self.batch_size = batch_size self.max_path_length = max_path_length self.discount = discount self.replay_buffer_size = replay_buffer_size self.reward_scale = reward_scale self.render = render self.save_replay_buffer = save_replay_buffer self.save_algorithm = save_algorithm self.save_environment = save_environment self.save_best = save_best self.save_best_starting_from_epoch = save_best_starting_from_epoch self.policy_uses_pixels = policy_uses_pixels self.policy_uses_task_params = policy_uses_task_params self.concat_task_params_to_policy_obs = concat_task_params_to_policy_obs self.freq_saving = freq_saving if eval_sampler is None: if eval_policy is None: eval_policy = exploration_policy eval_sampler = InPlacePathSampler( env=env, policy=eval_policy, max_samples=self.num_steps_per_eval + self.max_path_length, max_path_length=self.max_path_length, policy_uses_pixels=policy_uses_pixels, policy_uses_task_params=policy_uses_task_params, concat_task_params_to_policy_obs=concat_task_params_to_policy_obs ) self.eval_policy = eval_policy self.eval_sampler = eval_sampler self.action_space = env.action_space self.obs_space = env.observation_space self.env = env if replay_buffer is None: replay_buffer = EnvReplayBuffer( self.replay_buffer_size, self.env, policy_uses_pixels=self.policy_uses_pixels, policy_uses_task_params=self.policy_uses_task_params, concat_task_params_to_policy_obs=self.concat_task_params_to_policy_obs ) self.replay_buffer = replay_buffer self._n_env_steps_total = 0 self._n_train_steps_total = 0 self._n_rollouts_total = 0 self._do_train_time = 0 self._epoch_start_time = None self._algo_start_time = None self._old_table_keys = None self._current_path_builder = PathBuilder() self._exploration_paths = [] self.do_not_train = do_not_train self.num_episodes = 0 self.max_num_episodes = max_num_episodes if max_num_episodes is not None else float('inf') self.no_terminal = no_terminal def train(self, start_epoch=0): self.pretrain() if start_epoch == 0: params = self.get_epoch_snapshot(-1) logger.save_itr_params(-1, params) self.training_mode(False) self._n_env_steps_total = start_epoch * self.num_env_steps_per_epoch gt.reset() gt.set_def_unique(False) self.train_online(start_epoch=start_epoch) def pretrain(self): """ Do anything before the main training phase. 
""" pass def train_online(self, start_epoch=0): self._current_path_builder = PathBuilder() observation = self._start_new_rollout() for epoch in gt.timed_for( range(start_epoch, self.num_epochs), save_itrs=True, ): self._start_epoch(epoch) for _ in range(self.num_env_steps_per_epoch): # we are assuming that if it's a dict then it has # pixels and obs, and maybe obs_task_params if isinstance(self.obs_space, Dict): if self.policy_uses_pixels: agent_obs = observation['pixels'] else: agent_obs = observation['obs'] else: agent_obs = observation if self.policy_uses_task_params: task_params = observation['obs_task_params'] if self.concat_task_params_to_policy_obs: agent_obs = np.concatenate((agent_obs, task_params), -1) else: agent_obs = {'obs': agent_obs, 'obs_task_params': task_params} action, agent_info = self._get_action_and_info( agent_obs, ) if self.render: self.training_env.render() next_ob, raw_reward, terminal, env_info = ( self.training_env.step(action) ) if self.no_terminal: terminal = False self._n_env_steps_total += 1 reward = raw_reward * self.reward_scale terminal = np.array([terminal]) reward = np.array([reward]) self._handle_step( observation, action, reward, next_ob, terminal, agent_info=agent_info, env_info=env_info, ) if terminal or len(self._current_path_builder) >= self.max_path_length: self._handle_rollout_ending() observation = self._start_new_rollout() else: observation = next_ob gt.stamp('sample') if not self.do_not_train: self._try_to_train() gt.stamp('train') if self.num_episodes > self.max_num_episodes: self._try_to_eval(epoch) gt.stamp('eval') self._end_epoch() return self._try_to_eval(epoch) gt.stamp('eval') self._end_epoch() def _try_to_train(self): if self._can_train(): self.training_mode(True) for i in range(self.num_updates_per_train_call): self._do_training() self._n_train_steps_total += 1 self.training_mode(False) def _try_to_eval(self, epoch): if epoch % self.freq_saving == 0: logger.save_extra_data(self.get_extra_data_to_save(epoch)) if self._can_evaluate(): self.evaluate(epoch) if epoch % self.freq_saving == 0: params = self.get_epoch_snapshot(epoch) logger.save_itr_params(epoch, params) table_keys = logger.get_table_key_set() if self._old_table_keys is not None: # print('$$$$$$$$$$$$$$$') # print(table_keys) # print('\n'*4) # print(self._old_table_keys) # print('$$$$$$$$$$$$$$$') # print(set(table_keys) - set(self._old_table_keys)) # print(set(self._old_table_keys) - set(table_keys)) assert table_keys == self._old_table_keys, ( "Table keys cannot change from iteration to iteration." 
) self._old_table_keys = table_keys logger.record_tabular( "Number of train steps total", self._n_train_steps_total, ) logger.record_tabular( "Number of env steps total", self._n_env_steps_total, ) logger.record_tabular( "Number of rollouts total", self._n_rollouts_total, ) times_itrs = gt.get_times().stamps.itrs train_time = times_itrs['train'][-1] sample_time = times_itrs['sample'][-1] eval_time = times_itrs['eval'][-1] if epoch > 0 else 0 epoch_time = train_time + sample_time + eval_time total_time = gt.get_times().total logger.record_tabular('Train Time (s)', train_time) logger.record_tabular('(Previous) Eval Time (s)', eval_time) logger.record_tabular('Sample Time (s)', sample_time) logger.record_tabular('Epoch Time (s)', epoch_time) logger.record_tabular('Total Train Time (s)', total_time) logger.record_tabular("Epoch", epoch) logger.dump_tabular(with_prefix=False, with_timestamp=False) else: logger.log("Skipping eval for now.") def _can_evaluate(self): """ One annoying thing about the logger table is that the keys at each iteration need to be the exact same. So unless you can compute everything, skip evaluation. A common example for why you might want to skip evaluation is that at the beginning of training, you may not have enough data for a validation and training set. :return: """ return ( len(self._exploration_paths) > 0 and self.replay_buffer.num_steps_can_sample() >= self.batch_size ) def _can_train(self): return self.replay_buffer.num_steps_can_sample() >= self.batch_size def _get_action_and_info(self, observation): """ Get an action to take in the environment. :param observation: :return: """ self.exploration_policy.set_num_steps_total(self._n_env_steps_total) return self.exploration_policy.get_action( observation, ) def _start_epoch(self, epoch): self._epoch_start_time = time.time() self._exploration_paths = [] self._do_train_time = 0 logger.push_prefix('Iteration #%d | ' % epoch) def _end_epoch(self): logger.log("Epoch Duration: {0}".format( time.time() - self._epoch_start_time )) logger.log("Started Training: {0}".format(self._can_train())) logger.pop_prefix() def _start_new_rollout(self): self.num_episodes += 1 self.exploration_policy.reset() return self.training_env.reset() def _handle_path(self, path): """ Naive implementation: just loop through each transition. :param path: :return: """ for ( ob, action, reward, next_ob, terminal, agent_info, env_info ) in zip( path["observations"], path["actions"], path["rewards"], path["next_observations"], path["terminals"], path["agent_infos"], path["env_infos"], ): self._handle_step( ob, action, reward, next_ob, terminal, agent_info=agent_info, env_info=env_info, ) self._handle_rollout_ending() def _handle_step( self, observation, action, reward, next_observation, terminal, agent_info, env_info, ): """ Implement anything that needs to happen after every step :return: """ self._current_path_builder.add_all( observations=observation, actions=action, rewards=reward, next_observations=next_observation, terminals=terminal, agent_infos=agent_info, env_infos=env_info, ) self.replay_buffer.add_sample( observation=observation, action=action, reward=reward, terminal=terminal, next_observation=next_observation, agent_info=agent_info, env_info=env_info, ) def _handle_rollout_ending(self): """ Implement anything that needs to happen after every rollout. 
""" self.replay_buffer.terminate_episode() self._n_rollouts_total += 1 if len(self._current_path_builder) > 0: self._exploration_paths.append( self._current_path_builder.get_all_stacked() ) self._current_path_builder = PathBuilder() def get_epoch_snapshot(self, epoch): data_to_save = dict( epoch=epoch, exploration_policy=self.exploration_policy, ) if self.save_environment: data_to_save['env'] = self.training_env return data_to_save def get_extra_data_to_save(self, epoch): """ Save things that shouldn't be saved every snapshot but rather overwritten every time. :param epoch: :return: """ if self.render: self.training_env.render(close=True) data_to_save = dict( epoch=epoch, ) if self.save_environment: data_to_save['env'] = self.training_env if self.save_replay_buffer: data_to_save['replay_buffer'] = self.replay_buffer if self.save_algorithm: data_to_save['algorithm'] = self return data_to_save @abc.abstractmethod def training_mode(self, mode): """ Set training mode to `mode`. :param mode: If True, training will happen (e.g. set the dropout probabilities to not all ones). """ pass @abc.abstractmethod def cuda(self): """ Turn cuda on. :return: """ pass @abc.abstractmethod def evaluate(self, epoch): """ Evaluate the policy, e.g. save/print progress. :param epoch: :return: """ pass @abc.abstractmethod def _do_training(self): """ Perform some update, e.g. perform one gradient step. :return: """ pass
def obtain_eval_samples(self, epoch, mode='meta_train'): self.training_mode(False) self.policy.eval() if mode == 'meta_train': params_samples = self.train_task_params_sampler.sample_unique(self.num_tasks_per_eval) else: params_samples = self.test_task_params_sampler.sample_unique(self.num_tasks_per_eval) all_eval_tasks_paths = [] eval_task_num = -1 for task_params, obs_task_params in params_samples: eval_task_num += 1 saved_task_gif = False cur_eval_task_paths = [] if mode == 'meta_train': self.env = self.training_env_getter(obs_task_params) else: self.env = self.test_env_getter(obs_task_params) self.env.reset() task_identifier = self.env.task_identifier for _ in range(self.num_diff_context_per_eval_task): eval_policy, context = self.get_eval_policy(task_identifier, mode=mode, return_context=True) for _ in range(self.num_eval_trajs_per_post_sample): cur_eval_path_builder = PathBuilder() observation = self.env.reset() # from scipy.misc import imsave # imsave('plots/junk_vis/val_check_obtain_eval.png', observation['image'].transpose(1,2,0)) # if mode == 'meta_test': # 1/0 terminal = False while (not terminal) and len(cur_eval_path_builder) < self.max_path_length: agent_obs = observation action, agent_info = self._get_action_and_info(agent_obs) # print(self.env) # print(action) next_ob, raw_reward, terminal, env_info = (self.env.step(action)) if self.no_terminal: terminal = False reward = raw_reward terminal = np.array([terminal]) reward = np.array([reward]) cur_eval_path_builder.add_all( observations=observation, actions=action, rewards=reward, next_observations=next_ob, terminals=terminal, agent_infos=agent_info, env_infos=env_info, task_identifiers=task_identifier ) observation = next_ob if terminal and self.wrap_absorbing: raise NotImplementedError("I think they used 0 actions for this") cur_eval_path_builder.add_all( observations=next_ob, actions=action, rewards=reward, next_observations=next_ob, terminals=terminal, agent_infos=agent_info, env_infos=env_info, task_identifiers=task_identifier ) if len(cur_eval_path_builder) > 0: cur_eval_task_paths.append( cur_eval_path_builder.get_all_stacked() ) if not saved_task_gif: saved_task_gif = True if eval_task_num < 2: path = cur_eval_task_paths[-1] gif_frames = [d["image"]for d in path["observations"]] for frame_num, frame in enumerate(gif_frames): if frame_num % 4 == 3: imsave(osp.join(self.log_dir, mode+'task_%d_frame_%d.png'%(eval_task_num, frame_num)), frame.transpose(1,2,0)) # print(gif_frames) # for img in gif_frames: # print(np.max(img), np.min(img)) # write_gif(gif_frames, osp.join(self.log_dir, mode+'_%d.gif'%eval_task_num) , fps=20) if self.easy_context or self.last_image_is_context: context_img = ptu.get_numpy(context)[0].transpose(1,2,0) imsave(osp.join(self.log_dir, mode+'task_%d_context_%d.png'%(eval_task_num, eval_task_num)), context_img) if self.using_all_context: context_img = ptu.get_numpy(context['image'][0,-1]).transpose(1,2,0) imsave(osp.join(self.log_dir, mode+'task_%d_context_%d.png'%(eval_task_num, eval_task_num)), context_img) print('Saved the gifs') all_eval_tasks_paths.extend(cur_eval_task_paths) # flatten the list of lists self.policy.train() return all_eval_tasks_paths
def take_step_in_env_per_thread(pid, queue, env, policy, render, reward_scale, steps, max_path_length, n_env_steps_total): set_seed(pid) n_rollouts_total = 0 current_path_builder = PathBuilder() exploration_paths = [] replay_samples = { 'observations': [], 'actions': [], 'rewards': [], 'next_observations': [], 'terminals': [], 'agent_infos': [], 'env_infos': [], } policy.reset() observation = env.reset() policy.set_num_steps_total(n_env_steps_total) for _ in range(steps): action, agent_info = policy.get_action(observation) if pid == 0 and render: env.render() next_ob, raw_reward, terminal, env_info = env.step(action) reward = np.array([raw_reward * reward_scale]) terminal = np.array([terminal]) replay_samples['observations'].append(observation) replay_samples['actions'].append(action) replay_samples['rewards'].append(reward) replay_samples['next_observations'].append(next_ob) replay_samples['terminals'].append(terminal) replay_samples['agent_infos'].append(agent_info) replay_samples['env_infos'].append(env_info) current_path_builder.add_all( observations=observation, actions=action, rewards=reward, next_observations=next_ob, terminals=terminal, agent_infos=agent_info, env_infos=env_info, ) if terminal or len(current_path_builder) >= max_path_length: # cannot let replay buffer terminate episode n_rollouts_total += 1 if len(current_path_builder) > 0: exploration_paths.append( current_path_builder.get_all_stacked()) current_path_builder = PathBuilder() policy.reset() observation = env.reset() else: observation = next_ob if queue is None: return exploration_paths, replay_samples, n_rollouts_total else: queue.put([pid, exploration_paths, replay_samples, n_rollouts_total])
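take_step_in_env_per_thread pushes its results onto a queue when one is provided, which suggests a fan-out/fan-in pattern in the caller. A rough sketch of such a caller, assuming multiprocessing and picklable env/policy copies; the actual launcher in this codebase may differ.

import multiprocessing as mp

def parallel_sample(envs, policy, n_workers, steps_per_worker, max_path_length=1000):
    # one worker per env copy; each pushes [pid, paths, replay_samples, n_rollouts] onto the queue
    queue = mp.Queue()
    workers = [
        mp.Process(
            target=take_step_in_env_per_thread,
            args=(pid, queue, envs[pid], policy, False, 1.0,
                  steps_per_worker, max_path_length, 0),
        )
        for pid in range(n_workers)
    ]
    for w in workers:
        w.start()
    results = [queue.get() for _ in range(n_workers)]  # arrival order is nondeterministic
    for w in workers:
        w.join()
    return sorted(results, key=lambda r: r[0])  # re-order by pid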
def fill_buffer(buffer, meta_env, expert, expert_policy_specs, task_params_sampler, num_rollouts_per_task, max_path_length, no_terminal=False, policy_is_scripted=False, render=False, check_for_success=False, wrap_absorbing=False, subsample_factor=1, deterministic=True): expert_uses_pixels = expert_policy_specs['policy_uses_pixels'] expert_uses_task_params = expert_policy_specs['policy_uses_task_params'] # hack if 'concat_task_params_to_policy_obs' in expert_policy_specs: concat_task_params_to_policy_obs = expert_policy_specs[ 'concat_task_params_to_policy_obs'] else: concat_task_params_to_policy_obs = False # this is something for debugging few shot fetch demos # first_complete_list = [] for task_params, obs_task_params in task_params_sampler: # print('Doing Task {}...'.format(task_params)) debug_stats = [] meta_env.reset(task_params=task_params, obs_task_params=obs_task_params) task_id = meta_env.task_identifier num_rollouts_completed = 0 while num_rollouts_completed < num_rollouts_per_task: cur_rollout_rewards = 0 print('\tRollout %d...' % num_rollouts_completed) cur_path_builder = PathBuilder() observation = meta_env.reset(task_params=task_params, obs_task_params=obs_task_params) if policy_is_scripted: policy = expert policy.reset(meta_env) else: if isinstance(meta_env, AntLinearClassifierEnv): policy = expert.get_exploration_policy( meta_env.targets[meta_env.true_label]) # print(meta_env.true_label) if deterministic: policy.deterministic = True elif isinstance(meta_env, Walker2DRandomDynamicsEnv): # print('WalkerEnv') policy = expert.get_exploration_policy(obs_task_params) if deterministic: # print('deterministic') policy = MakeDeterministic(policy) else: policy = expert.get_exploration_policy(obs_task_params) if deterministic: policy.deterministic = True terminal = False subsample_mod = randint(0, subsample_factor - 1) step_num = 0 rollout_debug = [] while (not terminal) and step_num < max_path_length: if render: meta_env.render() if isinstance(meta_env.observation_space, Dict): if expert_uses_pixels: agent_obs = observation['pixels'] else: agent_obs = observation['obs'] if isinstance(meta_env, AntLinearClassifierEnv): if meta_env.use_relative_pos: agent_obs = np.concatenate([ agent_obs[:-12], meta_env.get_body_com("torso").flat ]).copy() else: agent_obs = agent_obs[:-12] else: agent_obs = observation if expert_uses_task_params: if concat_task_params_to_policy_obs: agent_obs = np.concatenate( (agent_obs, obs_task_params), -1) # else: # agent_obs = {'obs': agent_obs, 'obs_task_params': obs_task_params} if policy_is_scripted: action, agent_info = policy.get_action( agent_obs, meta_env, len(cur_path_builder)) else: action, agent_info = policy.get_action(agent_obs) next_ob, raw_reward, terminal, env_info = ( meta_env.step(action)) # raw_reward = -1.0 * env_info['run_cost'] # raw_reward = env_info['vel'] cur_rollout_rewards += raw_reward # if step_num < 200: cur_rollout_rewards += raw_reward # rollout_debug.append(env_info['l2_dist']) if no_terminal: terminal = False if wrap_absorbing: terminal_array = np.array([False]) else: terminal_array = np.array([terminal]) reward = raw_reward reward = np.array([reward]) if step_num % subsample_factor == subsample_mod: cur_path_builder.add_all(observations=observation, actions=action, rewards=reward, next_observations=next_ob, terminals=terminal_array, absorbing=np.array([0.0, 0.0]), agent_infos=agent_info, env_infos=env_info) observation = next_ob step_num += 1 if terminal and wrap_absorbing: ''' If we wrap absorbing states, two additional 
transitions must be added: (s_T, s_abs) and (s_abs, s_abs). In Disc Actor Critic paper they make s_abs be a vector of 0s with last dim set to 1. Here we are going to add the following: ([next_ob,0], random_action, [next_ob, 1]) and ([next_ob,1], random_action, [next_ob, 1]) This way we can handle varying types of terminal states. ''' # next_ob is the absorbing state # for now just sampling random action cur_path_builder.add_all( observations=next_ob, actions=action, # the reward doesn't matter rewards=0.0, next_observations=next_ob, terminals=np.array([False]), absorbing=np.array([0.0, 1.0]), agent_infos=agent_info, env_infos=env_info) cur_path_builder.add_all( observations=next_ob, actions=action, # the reward doesn't matter rewards=0.0, next_observations=next_ob, terminals=np.array([False]), absorbing=np.array([1.0, 1.0]), agent_infos=agent_info, env_infos=env_info) # if necessary check if it was successful if check_for_success: was_successful = np.sum([ e_info['is_success'] for e_info in cur_path_builder['env_infos'] ]) > 0 if was_successful: print('\t\tSuccessful') else: print('\t\tNot Successful') if (check_for_success and was_successful) or (not check_for_success): for timestep in range(len(cur_path_builder)): buffer.add_sample( cur_path_builder['observations'][timestep], cur_path_builder['actions'][timestep], cur_path_builder['rewards'][timestep], cur_path_builder['terminals'][timestep], cur_path_builder['next_observations'][timestep], task_id, agent_info=cur_path_builder['agent_infos'][timestep], env_info=cur_path_builder['env_infos'][timestep], absorbing=cur_path_builder['absorbing'][timestep]) buffer.terminate_episode(task_id) num_rollouts_completed += 1 print('\t\tReturn: %.2f' % (cur_rollout_rewards)) debug_stats.append(cur_rollout_rewards) # print('Min L2: %.3f' % np.min(rollout_debug)) # print(policy.first_time_all_complete) # first_complete_list.append(expert_policy.first_time_all_complete) # print(np.histogram(first_complete_list, bins=100)) print('%.1f +/- %.1f' % (np.mean(debug_stats), np.std(debug_stats))) print('\n\n')
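The wrap_absorbing branch above follows the Discriminator-Actor-Critic convention: the terminal observation is routed into a self-looping absorbing state, tracked here through the separate 'absorbing' field. A tiny sketch of the ([next_ob, 0]) / ([next_ob, 1]) indicator view described in the docstring; the helper names are my own.

import numpy as np

def wrap_obs(obs, absorbing_flag):
    # [obs, 0.] for regular states, [obs, 1.] for the absorbing state s_abs
    return np.concatenate([np.asarray(obs).flatten(), [absorbing_flag]])

def absorbing_transitions(last_next_ob, random_action):
    s_T = wrap_obs(last_next_ob, 0.0)
    s_abs = wrap_obs(last_next_ob, 1.0)
    # (s_T -> s_abs) plus the self-loop (s_abs -> s_abs); the rewards on these don't matter
    return [(s_T, random_action, s_abs), (s_abs, random_action, s_abs)]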
class MetaRLAlgorithm(metaclass=abc.ABCMeta): def __init__( self, env, policy, train_tasks, eval_tasks, meta_batch=64, num_iterations=100, num_train_steps_per_itr=1000, num_tasks_sample=100, num_steps_per_task=100, num_evals=10, num_steps_per_eval=1000, batch_size=1024, embedding_batch_size=1024, embedding_mini_batch_size=1024, max_path_length=1000, discount=0.99, replay_buffer_size=1000000, #1000000, reward_scale=1, train_embedding_source='posterior_only', eval_embedding_source='initial_pool', eval_deterministic=True, render=False, save_replay_buffer=False, save_algorithm=False, save_environment=False, obs_emb_dim=0): """ Base class for Meta RL Algorithms :param env: training env :param policy: policy that is conditioned on a latent variable z that rl_algorithm is responsible for feeding in :param train_tasks: list of tasks used for training :param eval_tasks: list of tasks used for eval :param meta_batch: number of tasks used for meta-update :param num_iterations: number of meta-updates taken :param num_train_steps_per_itr: number of meta-updates performed per iteration :param num_tasks_sample: number of train tasks to sample to collect data for :param num_steps_per_task: number of transitions to collect per task :param num_evals: number of independent evaluation runs, with separate task encodings :param num_steps_per_eval: number of transitions to sample for evaluation :param batch_size: size of batches used to compute RL update :param embedding_batch_size: size of batches used to compute embedding :param embedding_mini_batch_size: size of batch used for encoder update :param max_path_length: max episode length :param discount: :param replay_buffer_size: max replay buffer size :param reward_scale: :param render: :param save_replay_buffer: :param save_algorithm: :param save_environment: """ self.env = env self.policy = policy self.exploration_policy = policy # Can potentially use a different policy purely for exploration rather than also solving tasks, currently not being used self.train_tasks = train_tasks self.eval_tasks = eval_tasks self.meta_batch = meta_batch self.num_iterations = num_iterations self.num_train_steps_per_itr = num_train_steps_per_itr self.num_tasks_sample = num_tasks_sample self.num_steps_per_task = num_steps_per_task self.num_evals = num_evals self.num_steps_per_eval = num_steps_per_eval self.batch_size = batch_size self.embedding_batch_size = embedding_batch_size self.embedding_mini_batch_size = embedding_mini_batch_size self.max_path_length = max_path_length self.discount = discount self.replay_buffer_size = min( int(replay_buffer_size / (len(train_tasks))), 1000) self.reward_scale = reward_scale self.train_embedding_source = train_embedding_source self.eval_embedding_source = eval_embedding_source # TODO: add options for computing embeddings on train tasks too self.eval_deterministic = eval_deterministic self.render = render self.save_replay_buffer = save_replay_buffer self.save_algorithm = save_algorithm self.save_environment = save_environment self.eval_sampler = InPlacePathSampler( env=env, policy=policy, max_samples=self.num_steps_per_eval, max_path_length=self.max_path_length, ) # separate replay buffers for # - training RL update # - training encoder update # - testing encoder self.replay_buffer = MultiTaskReplayBuffer(self.replay_buffer_size, env, self.train_tasks, state_dim=obs_emb_dim) self.enc_replay_buffer = MultiTaskReplayBuffer(self.replay_buffer_size, env, self.train_tasks, state_dim=obs_emb_dim) self.eval_enc_replay_buffer = MultiTaskReplayBuffer( 
self.replay_buffer_size, env, self.eval_tasks, state_dim=obs_emb_dim) self._n_env_steps_total = 0 self._n_train_steps_total = 0 self._n_rollouts_total = 0 self._do_train_time = 0 self._epoch_start_time = None self._algo_start_time = None self._old_table_keys = None self._current_path_builder = PathBuilder() self._exploration_paths = [] def make_exploration_policy(self, policy): return policy def make_eval_policy(self, policy): return policy def sample_task(self, is_eval=False): ''' sample task randomly ''' if is_eval: idx = np.random.randint(len(self.eval_tasks)) else: idx = np.random.randint(len(self.train_tasks)) return idx def train(self): ''' meta-training loop ''' self.pretrain() params = self.get_epoch_snapshot(-1) logger.save_itr_params(-1, params) gt.reset() gt.set_def_unique(False) self._current_path_builder = PathBuilder() self.train_obs = self._start_new_rollout() # at each iteration, we first collect data from tasks, perform meta-updates, then try to evaluate for it_ in gt.timed_for( range(self.num_iterations), save_itrs=True, ): self._start_epoch(it_) self.training_mode(True) if it_ == 0: print('collecting initial pool of data for train and eval') # temp for evaluating for idx in self.train_tasks: print('train task', idx) self.task_idx = idx self.env.reset_task(idx) self.collect_data_sampling_from_prior( num_samples=self.max_path_length * 10, resample_z_every_n=self.max_path_length, eval_task=False) """ for idx in self.eval_tasks: self.task_idx = idx self.env.reset_task(idx) # TODO: make number of initial trajectories a parameter self.collect_data_sampling_from_prior(num_samples=self.max_path_length * 20, resample_z_every_n=self.max_path_length, eval_task=True) """ # Sample data from train tasks. for i in range(self.num_tasks_sample): idx = np.random.randint(len(self.train_tasks)) self.task_idx = idx self.env.reset_task(idx) # TODO: there may be more permutations of sampling/adding to encoding buffer we may wish to try if self.train_embedding_source == 'initial_pool': # embeddings are computed using only the initial pool of data # sample data from posterior to train RL algorithm self.collect_data_from_task_posterior( idx=idx, num_samples=self.num_steps_per_task, add_to_enc_buffer=False) elif self.train_embedding_source == 'posterior_only': self.collect_data_from_task_posterior( idx=idx, num_samples=self.num_steps_per_task, eval_task=False, add_to_enc_buffer=True) elif self.train_embedding_source == 'online_exploration_trajectories': # embeddings are computed using only data collected using the prior # sample data from posterior to train RL algorithm self.enc_replay_buffer.task_buffers[idx].clear() # resamples using current policy, conditioned on prior self.collect_data_sampling_from_prior( num_samples=self.num_steps_per_task, resample_z_every_n=self.max_path_length, add_to_enc_buffer=True) self.env.reset_task(idx) self.collect_data_from_task_posterior( idx=idx, num_samples=self.num_steps_per_task, add_to_enc_buffer=False, viz=True) elif self.train_embedding_source == 'online_on_policy_trajectories': # sample from prior, then sample more from the posterior # embeddings computed from both prior and posterior data self.enc_replay_buffer.task_buffers[idx].clear() self.collect_data_online( idx=idx, num_samples=self.num_steps_per_task, add_to_enc_buffer=True) else: raise Exception( "Invalid option for computing train embedding {}". format(self.train_embedding_source)) # Sample train tasks and compute gradient updates on parameters. 
for train_step in range(self.num_train_steps_per_itr): indices = np.random.choice(self.train_tasks, self.meta_batch) self._do_training(indices, train_step) self._n_train_steps_total += 1 gt.stamp('train') #self.training_mode(False) # eval self._try_to_eval(it_) gt.stamp('eval') self._end_epoch() def pretrain(self): """ Do anything before the main training phase. """ pass def sample_z_from_prior(self): """ Samples z from the prior distribution, which can be either a delta function at 0 or a standard Gaussian depending on whether we use the information bottleneck. :return: latent z as a Numpy array """ pass def sample_z_from_posterior(self, idx, eval_task): """ Samples z from the posterior distribution given data from task idx, where data comes from the encoding buffer :param idx: task idx from which to compute the posterior from :param eval_task: whether or not the task is an eval task :return: latent z as a Numpy array """ pass # TODO: maybe find a better name for resample_z_every_n? def collect_data_sampling_from_prior(self, num_samples=1, resample_z_every_n=None, eval_task=False, add_to_enc_buffer=True): # do not resample z if resample_z_every_n is None if resample_z_every_n is None: self.policy.clear_z() self.collect_data(self.policy, num_samples=num_samples, eval_task=eval_task, add_to_enc_buffer=add_to_enc_buffer) else: # collects more data in batches of resample_z_every_n until done while num_samples > 0: self.collect_data_sampling_from_prior( num_samples=min(resample_z_every_n, num_samples), resample_z_every_n=None, eval_task=eval_task, add_to_enc_buffer=add_to_enc_buffer) num_samples -= resample_z_every_n def collect_data_from_task_posterior(self, idx, num_samples=1, resample_z_every_n=None, eval_task=False, add_to_enc_buffer=True, viz=False): # do not resample z if resample_z_every_n is None if resample_z_every_n is None: self.sample_z_from_posterior(idx, eval_task=eval_task) self.collect_data(self.policy, num_samples=num_samples, eval_task=eval_task, add_to_enc_buffer=add_to_enc_buffer, viz=viz) else: # collects more data in batches of resample_z_every_n until done while num_samples > 0: self.collect_data_from_task_posterior( idx=idx, num_samples=min(resample_z_every_n, num_samples), resample_z_every_n=None, eval_task=eval_task, add_to_enc_buffer=add_to_enc_buffer, viz=viz) num_samples -= resample_z_every_n # split number of prior and posterior samples def collect_data_online(self, idx, num_samples, eval_task=False, add_to_enc_buffer=True): self.collect_data_sampling_from_prior( num_samples=num_samples, resample_z_every_n=self.max_path_length, eval_task=eval_task, add_to_enc_buffer=True) self.env.reset_task(idx) self.collect_data_from_task_posterior( idx=idx, num_samples=num_samples, resample_z_every_n=self.max_path_length, eval_task=eval_task, add_to_enc_buffer=add_to_enc_buffer, viz=True) # TODO: since switching tasks now resets the environment, we are not correctly handling episodes terminating # correctly. We also aren't using the episodes anywhere, but we should probably change this to make it gather paths # until we have more samples than num_samples, to make sure every episode cleanly terminates when intended. 
# @profile def collect_data(self, agent, num_samples=1, max_resets=None, eval_task=False, add_to_enc_buffer=True, viz=False): ''' collect data from current env in batch mode with given policy ''' images = [] # if num_samples == 50: # import pdb; pdb.set_trace() env_time = self.env.time rews = [] terms = [] n_resets = 0 for _ in range(num_samples): action, agent_info = self._get_action_and_info( agent, self.train_obs) if self.render: self.env.render() next_ob, raw_reward, terminal, env_info = (self.env.step(action)) if viz: images.append(next_ob) # vis.image(next_ob[-1]) reward = raw_reward rews += [reward] terms += [terminal] terminal = np.array([terminal]) reward = np.array([reward]) self._handle_step( self.task_idx, np.concatenate( [self.train_obs.flatten()[None], agent_info['obs_emb']], axis=-1), action, reward, np.concatenate([ next_ob.flatten()[None], torch.zeros(agent_info['obs_emb'].shape) ], axis=-1), terminal, eval_task=eval_task, add_to_enc_buffer=add_to_enc_buffer, agent_info=agent_info, env_info=env_info, ) # TODO USE masking here to handle the terminal episodes # print(len(self._current_path_builder)) if terminal or len( self._current_path_builder) >= self.max_path_length: self._handle_rollout_ending(eval_task=eval_task) self.train_obs = self._start_new_rollout() n_resets += 1 if _ + self.max_path_length > num_samples - 1: break if max_resets is not None and n_resets > max_resets: break else: # print((next_ob - self.train_obs).sum()) # self.train_obs = None self.train_obs = next_ob if viz and np.random.random() < 0.3: # import pdb; pdb.set_trace() vis.images(np.stack(images)[:, -1:]) vis.line(np.array([rews, terms]).T, opts=dict(width=400, height=320)) vis.text('', opts=dict(width=10000, height=5)) # vis.video(np.stack(images)) if not eval_task: self._n_env_steps_total += num_samples gt.stamp('sample') def _try_to_eval(self, epoch): logger.save_extra_data(self.get_extra_data_to_save(epoch)) if self._can_evaluate(): self.evaluate(epoch) params = self.get_epoch_snapshot(epoch) logger.save_itr_params(epoch, params) table_keys = logger.get_table_key_set() if self._old_table_keys is not None: assert table_keys == self._old_table_keys, ( "Table keys cannot change from iteration to iteration.") self._old_table_keys = table_keys logger.record_tabular( "Number of train steps total", self._n_train_steps_total, ) logger.record_tabular( "Number of env steps total", self._n_env_steps_total, ) logger.record_tabular( "Number of rollouts total", self._n_rollouts_total, ) times_itrs = gt.get_times().stamps.itrs train_time = times_itrs['train'][-1] sample_time = times_itrs['sample'][-1] eval_time = times_itrs['eval'][-1] if epoch > 0 else 0 epoch_time = train_time + sample_time + eval_time total_time = gt.get_times().total logger.record_tabular('Train Time (s)', train_time) logger.record_tabular('(Previous) Eval Time (s)', eval_time) logger.record_tabular('Sample Time (s)', sample_time) logger.record_tabular('Epoch Time (s)', epoch_time) logger.record_tabular('Total Train Time (s)', total_time) logger.record_tabular("Epoch", epoch) logger.dump_tabular(with_prefix=False, with_timestamp=False) else: logger.log("Skipping eval for now.") def _can_evaluate(self): """ One annoying thing about the logger table is that the keys at each iteration need to be the exact same. So unless you can compute everything, skip evaluation. A common example for why you might want to skip evaluation is that at the beginning of training, you may not have enough data for a validation and training set. 
:return: """ # import pdb; pdb.set_trace() return ( # len(self._exploration_paths) > 0 # and self.replay_buffer.num_steps_can_sample(self.task_idx) >= self.batch_size) def _can_train(self): return all([ self.replay_buffer.num_steps_can_sample(idx) >= self.batch_size for idx in self.train_tasks ]) def _get_action_and_info(self, agent, observation): """ Get an action to take in the environment. :param observation: :return: """ agent.set_num_steps_total(self._n_env_steps_total) return agent.get_action(observation, ) def _start_epoch(self, epoch): self._epoch_start_time = time.time() self._exploration_paths = [] self._do_train_time = 0 logger.push_prefix('Iteration #%d | ' % epoch) def _end_epoch(self): logger.log("Epoch Duration: {0}".format(time.time() - self._epoch_start_time)) logger.log("Started Training: {0}".format(self._can_train())) logger.pop_prefix() def _start_new_rollout(self): ret = self.env.reset() if isinstance(ret, tuple): ret = ret[0] return ret # not used def _handle_path(self, path): """ Naive implementation: just loop through each transition. :param path: :return: """ for (ob, action, reward, next_ob, terminal, agent_info, env_info) in zip( path["observations"], path["actions"], path["rewards"], path["next_observations"], path["terminals"], path["agent_infos"], path["env_infos"], ): self._handle_step( ob.reshape(-1), action, reward, next_ob.reshape(-1), terminal, agent_info=agent_info, env_info=env_info, ) self._handle_rollout_ending() def _handle_step( self, task_idx, observation, action, reward, next_observation, terminal, agent_info, env_info, eval_task=False, add_to_enc_buffer=True, ): """ Implement anything that needs to happen after every step :return: """ self._current_path_builder.add_all( task=task_idx, observations=observation, actions=action, rewards=reward, next_observations=next_observation, terminals=terminal, agent_infos=agent_info, env_infos=env_info, ) if eval_task: self.eval_enc_replay_buffer.add_sample( task=task_idx, observation=observation, action=action, reward=reward, terminal=terminal, next_observation=next_observation, agent_info=agent_info, env_info=env_info, ) else: self.replay_buffer.add_sample( task=task_idx, observation=observation, action=action, reward=reward, terminal=terminal, next_observation=next_observation, agent_info=agent_info, env_info=env_info, ) if add_to_enc_buffer: self.enc_replay_buffer.add_sample( task=task_idx, observation=observation, action=action, reward=reward, terminal=terminal, next_observation=next_observation, agent_info=agent_info, env_info=env_info, ) def _handle_rollout_ending(self, eval_task=False): """ Implement anything that needs to happen after every rollout. """ if eval_task: self.eval_enc_replay_buffer.terminate_episode(self.task_idx) else: self.replay_buffer.terminate_episode(self.task_idx) self.enc_replay_buffer.terminate_episode(self.task_idx) self._n_rollouts_total += 1 if len(self._current_path_builder) > 0: # and False: # self._exploration_paths.append( # self._current_path_builder.get_all_stacked() # ) self._current_path_builder = PathBuilder() def get_epoch_snapshot(self, epoch): data_to_save = dict( epoch=epoch, exploration_policy=self.exploration_policy, ) if self.save_environment: data_to_save['env'] = self.training_env return data_to_save def get_extra_data_to_save(self, epoch): """ Save things that shouldn't be saved every snapshot but rather overwritten every time. 
:param epoch: :return: """ if self.render: self.training_env.render(close=True) data_to_save = dict(epoch=epoch, ) if self.save_environment: data_to_save['env'] = self.training_env if self.save_replay_buffer: data_to_save['replay_buffer'] = self.replay_buffer if self.save_algorithm: data_to_save['algorithm'] = self return data_to_save @abc.abstractmethod def training_mode(self, mode): """ Set training mode to `mode`. :param mode: If True, training will happen (e.g. set the dropout probabilities to not all ones). """ pass @abc.abstractmethod def evaluate(self, epoch): """ Evaluate the policy, e.g. save/print progress. :param epoch: :return: """ pass @abc.abstractmethod def _do_training(self): """ Perform some update, e.g. perform one gradient step. :return: """ pass
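collect_data_sampling_from_prior above resamples z from the prior in chunks of resample_z_every_n steps via recursion. An equivalent iterative sketch, with a simplified signature and collect_data passed in as a callable:

def collect_with_prior_resampling(policy, collect_data, num_samples, resample_z_every_n):
    # clear z (i.e. fall back to the prior) every resample_z_every_n environment steps
    remaining = num_samples
    while remaining > 0:
        policy.clear_z()
        collect_data(policy, num_samples=min(resample_z_every_n, remaining))
        remaining -= resample_z_every_n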
def collect_one_step( self, max_path_length, discard_incomplete_paths, steps_collector: PathBuilder = None, random=False, ): if self._obs is None: self._start_new_rollout() if random: actions = [self._env.action_space.sample() for _ in range(self._env_num)] else: actions = self._policy.get_actions(self._obs) next_obs, rewards, terminals, env_infos = self._env.step(actions) if self._render: self._env.render(**self._render_kwargs) # unzip vectorized data for env_idx, ( path_builder, next_ob, action, reward, terminal, env_info, ) in enumerate(zip( self._current_path_builders, next_obs, actions, rewards, terminals, env_infos, )): obs = self._obs[env_idx].copy() terminal = np.array([terminal]) reward = np.array([reward]) # store path obs path_builder.add_all( observations=obs, actions=action, rewards=reward, next_observations=next_ob, terminals=terminal, agent_infos={}, # policy.get_actions doesn't return agent_info env_infos=env_info, ) if steps_collector is not None: steps_collector.add_all( observations=obs, actions=action, rewards=reward, next_observations=next_ob, terminals=terminal, agent_infos={}, # policy.get_actions doesn't return agent_info env_infos=env_info, ) self._obs[env_idx] = next_ob if terminal or len(path_builder) >= max_path_length: self._handle_rollout_ending(path_builder, max_path_length, discard_incomplete_paths) self._start_new_rollout(env_idx)
def obtain_eval_samples(self, epoch, mode='meta_train'): self.training_mode(False) if mode == 'meta_train': params_samples = self.train_task_params_sampler.sample_unique( self.num_tasks_per_eval) else: params_samples = self.test_task_params_sampler.sample_unique( self.num_tasks_per_eval) all_eval_tasks_paths = [] for task_params, obs_task_params in params_samples: cur_eval_task_paths = [] self.env.reset(task_params=task_params, obs_task_params=obs_task_params) task_identifier = self.env.task_identifier for _ in range(self.num_diff_context_per_eval_task): eval_policy = self.get_eval_policy(task_identifier, mode=mode) for _ in range(self.num_eval_trajs_per_post_sample): cur_eval_path_builder = PathBuilder() observation = self.env.reset( task_params=task_params, obs_task_params=obs_task_params) terminal = False while (not terminal) and len( cur_eval_path_builder) < self.max_path_length: if isinstance(self.obs_space, Dict): if self.policy_uses_pixels: agent_obs = observation['pixels'] else: agent_obs = observation['obs'] else: agent_obs = observation action, agent_info = eval_policy.get_action(agent_obs) next_ob, raw_reward, terminal, env_info = ( self.env.step(action)) if self.no_terminal: terminal = False reward = raw_reward terminal = np.array([terminal]) reward = np.array([reward]) cur_eval_path_builder.add_all( observations=observation, actions=action, rewards=reward, next_observations=next_ob, terminals=terminal, agent_infos=agent_info, env_infos=env_info, task_identifiers=task_identifier) observation = next_ob if terminal and self.wrap_absorbing: raise NotImplementedError( "I think they used 0 actions for this") cur_eval_path_builder.add_all( observations=next_ob, actions=action, rewards=reward, next_observations=next_ob, terminals=terminal, agent_infos=agent_info, env_infos=env_info, task_identifiers=task_identifier) if len(cur_eval_path_builder) > 0: cur_eval_task_paths.append( cur_eval_path_builder.get_all_stacked()) all_eval_tasks_paths.extend(cur_eval_task_paths) # flatten the list of lists return all_eval_tasks_paths
def fill_buffer( buffer, env, expert_policy, num_rollouts, max_path_length, no_terminal=False, policy_is_scripted=False, render=False, render_kwargs={}, check_for_success=False, wrap_absorbing=False, subsample_factor=1, ): num_rollouts_completed = 0 total_rewards = 0.0 while num_rollouts_completed < num_rollouts: print('Rollout %d...' % num_rollouts_completed) cur_path_builder = PathBuilder() observation = env.reset() if policy_is_scripted: expert_policy.reset(env) # if subsampling what offset do you want to use subsample_mod = randint(0, subsample_factor - 1) rewards_for_rollout = 0.0 printed_target_dist = False step_num = 0 terminal = False while (not terminal) and step_num < max_path_length: if render: env.render(**render_kwargs) # get the action if policy_is_scripted: action, agent_info = expert_policy.get_action( observation, env, len(cur_path_builder)) else: action, agent_info = expert_policy.get_action(observation) next_ob, reward, terminal, env_info = env.step(action) if no_terminal: terminal = False terminal_array = np.array([terminal]) rewards_for_rollout += reward reward = np.array([reward]) if step_num % subsample_factor == subsample_mod: cur_path_builder.add_all(observations=observation, actions=action, rewards=reward, next_observations=next_ob, terminals=terminal_array, absorbing=np.array([0.0, 0.0]), agent_infos=agent_info, env_infos=env_info) observation = next_ob step_num += 1 print('\tNum Steps: %d' % step_num) print('\tReturns: %.2f' % rewards_for_rollout) # if necessary check if it was successful if check_for_success: was_successful = np.sum([ e_info['is_success'] for e_info in cur_path_builder['env_infos'] ]) > 0 if was_successful: print('\tSuccessful') else: print('\tNot Successful') # add the path to the buffer if (check_for_success and was_successful) or (not check_for_success): for timestep in range(len(cur_path_builder)): buffer.add_sample( cur_path_builder['observations'][timestep], cur_path_builder['actions'][timestep], cur_path_builder['rewards'][timestep], cur_path_builder['terminals'][timestep], cur_path_builder['next_observations'][timestep], agent_info=cur_path_builder['agent_infos'][timestep], env_info=cur_path_builder['env_infos'][timestep], absorbing=cur_path_builder['absorbing'][timestep]) buffer.terminate_episode() num_rollouts_completed += 1 total_rewards += rewards_for_rollout print('\nAverage Episode Return: %f\n' % (total_rewards / num_rollouts_completed))
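Both fill_buffer variants subsample demonstration transitions by keeping one random phase out of every subsample_factor steps. The filter on its own (the function name is mine):

from random import randint

def kept_steps(num_steps, subsample_factor):
    # keep step t iff t % subsample_factor == offset, with a fresh random phase per rollout,
    # mirroring `step_num % subsample_factor == subsample_mod` above
    offset = randint(0, subsample_factor - 1)
    return [t for t in range(num_steps) if t % subsample_factor == offset]

print(kept_steps(10, 3))  # e.g. [1, 4, 7] -- roughly every third transition is stored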