def send_segments(n_segments, seg_pipe):
    # Test helper: push n_segments dummy segments, each 25 steps of a zeroed
    # 84x84x4 frame stack, into seg_pipe.
    frame_stack = np.zeros((84, 84, 4))
    for i in range(n_segments):
        segment = Segment()
        for _ in range(25):
            segment.append(frame=frame_stack, reward=0)
        segment.finalise(seg_id=i)
        seg_pipe.put(segment)
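# Usage sketch (an illustration, not part of the original tests): a hypothetical smoke test
# could fill a small multiprocessing Queue with send_segments and drain it again, assuming
# Segment exposes __len__ as it does elsewhere in this codebase.
import multiprocessing as mp


def _send_segments_smoke_test():
    pipe = mp.get_context('spawn').Queue(maxsize=10)
    send_segments(n_segments=3, seg_pipe=pipe)
    for _ in range(3):
        segment = pipe.get(timeout=5)
        assert len(segment) == 25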
class HumanPreferencesEnvWrapper(Wrapper):
    def __init__(self,
                 env: Env,
                 reward_predictor_network: Callable = net_cnn,
                 train_reward: bool = True,
                 collect_prefs: bool = True,
                 segment_length: int = 40,
                 mp_context: str = 'spawn',
                 prefs_dir: str = None,
                 log_dir: str = "drlhp_logs/",
                 max_prefs_in_db: int = 10000,
                 obs_transform_func: Callable = None,
                 n_initial_training_steps: int = 50,
                 n_initial_prefs: int = 40,
                 pretrained_reward_predictor_dir: str = None,
                 reward_predictor_ckpt_interval: int = 10,
                 reward_predictor_refresh_interval: int = 10,
                 validation_interval: int = 10,
                 reward_database_refresh_interval: int = 1,
                 synthetic_prefs: bool = True,
                 max_pref_interface_segs: int = 25,
                 zoom_ratio: int = 4,
                 channels: int = 3,
                 env_wrapper_log_level: int = logging.INFO,
                 reward_predictor_log_level: int = logging.INFO,
                 pref_interface_log_level: int = logging.INFO):
        """
        A Wrapper that collects segments from the observations returned by its internal env's
        .step() function and sends them to a PrefInterface, which queries either humans or a
        synthetic reward oracle for preferences. It also manages creating and training a reward
        prediction network, using preferences stored in a PrefDB as training examples. Once a
        minimum number of training steps has been reached, it loads the trained reward predictor
        network and returns that network's reward rather than the underlying environment reward.

        :param env: Underlying environment
        :param reward_predictor_network: Callable mapping an input obs to a reward scalar
        :param train_reward: Whether the env should train a reward predictor
        :param collect_prefs: Whether the env should collect preferences in a PrefDB
        :param segment_length: How many observations long a segment should be before it's sent
            to the PrefInterface
        :param mp_context: The multiprocessing context to use for this env's processes
        :param prefs_dir: A string path to an existing set of PrefDBs, if any exist
        :param log_dir: A string path where logs and artifacts from this run should be saved
        :param max_prefs_in_db: The maximum number of preferences to store across both train and
            validation PrefDBs
        :param obs_transform_func: An optional function that transforms the observation returned
            by the internal environment into the observation that should be concatenated to form
            segments (for example, if the underlying environment is a Dict space, the transform
            func could return obs['pov'])
        :param n_initial_training_steps: How many training steps should be performed before we
            switch to using the trained reward model as the returned environment reward
        :param n_initial_prefs: How many preferences to collect before starting to train the
            reward predictor
        :param pretrained_reward_predictor_dir: A string path to a pretrained reward predictor,
            if one exists
        :param reward_predictor_refresh_interval: Interval of reward predictor training steps on
            which to update the reward predictor used by the env to calculate reward
        :param validation_interval: Interval of reward predictor training steps on which to
            perform validation
        :param reward_database_refresh_interval: Interval of reward predictor training steps on
            which to refresh the PrefDBs used for training/validation
        :param reward_predictor_ckpt_interval: Interval of reward predictor training steps on
            which to automatically checkpoint the reward prediction model
        :param synthetic_prefs: If True, use the environment's reward function to calculate
            preferences; if False, query for human preferences via a GUI interface
        :param max_pref_interface_segs: The maximum number of segments stored and paired with one
            another by the preference interface
        :param zoom_ratio: How much images should be zoomed when displayed to humans in the GUI
            (ignored if using synthetic preferences)
        :param channels: The number of channels in the images shown to humans. (Can't be inferred
            from the observation space shape because common usage involves a FrameStack wrapper,
            which stacks frames along the channel dimension)
        :param env_wrapper_log_level: Log level of the logger for the wrapper as a whole
        :param reward_predictor_log_level: Log level of the logger for the reward predictor
            training function
        :param pref_interface_log_level: Log level of the logger used by the preference interface
        """
        # Recommend using 'spawn' for non-synthetic preferences and 'fork' for synthetic ones
        super(HumanPreferencesEnvWrapper, self).__init__(env)
        self.logger = logging.getLogger("HumanPreferencesEnvWrapper")
        self.logger.setLevel(env_wrapper_log_level)
        self.reward_predictor_log_level = reward_predictor_log_level
        self.pref_interface_log_level = pref_interface_log_level
        self.obs_shape = env.observation_space.shape
        self.preference_interface = PrefInterface(synthetic_prefs=synthetic_prefs,
                                                  max_segs=max_pref_interface_segs,
                                                  log_dir=log_dir,
                                                  channels=channels,
                                                  zoom=zoom_ratio)

        # Save a bunch of init parameters as wrapper properties
        self.synthetic_prefs = synthetic_prefs
        self.mp_context = mp_context
        self.train_reward = train_reward
        self.collect_prefs = collect_prefs
        self.segment_length = segment_length
        self.reward_predictor_network = reward_predictor_network
        self.pretrained_reward_predictor_dir = pretrained_reward_predictor_dir
        self.obs_transform_func = obs_transform_func
        self.prefs_dir = prefs_dir
        self.max_prefs = max_prefs_in_db
        self.n_initial_prefs = n_initial_prefs
        self.n_initial_training_steps = n_initial_training_steps
        self.log_dir = log_dir
        self.ckpt_interval = reward_predictor_ckpt_interval
        self.reward_predictor_refresh_interval = reward_predictor_refresh_interval
        self.val_interval = validation_interval
        self.reward_database_refresh_interval = reward_database_refresh_interval

        # Set counter and status variables to initial values
        self.segments_collected = 0
        self.reward_predictor_n_train = 0
        self.using_reward_from_predictor = False
        self.force_return_true_reward = False
        self.collecting_segments = True
        self.last_true_reward = None

        # Create an empty observation stack and a new segment
        self.recent_obs_stack = []
        self.episode_segment = Segment()
        self.reward_predictor_checkpoint_dir = os.path.join(log_dir, 'reward_predictor_checkpoints')

        # Create Queues and Values to handle multiprocessing communication
        # TODO figure out how to make the mechanics of this work with larger Queues,
        # TODO so we don't drop segments on the ground due to timing issues
        self.seg_pipe = mp.get_context(self.mp_context).Queue(maxsize=5)
        self.pref_pipe = mp.get_context(self.mp_context).Queue(maxsize=1)
        self.pref_db_size = mp.get_context(self.mp_context).Value('i', 0)
        self.kill_pref_interface_flag = mp.get_context(self.mp_context).Value('i', 0)
        self.kill_reward_training_flag = mp.get_context(self.mp_context).Value('i', 0)
        self.save_model_flag = mp.get_context(self.mp_context).Value('i', 0)
        self.save_prefs_flag = mp.get_context(self.mp_context).Value('i', 0)
        self.reward_training_steps = mp.get_context(self.mp_context).Value('i', 0)

        # Create placeholders for things that we'll initialize later
        self.pref_interface_proc = None
        self.reward_training_proc = None
        self.pref_buffer = None
        self.reward_predictor = None

        # If we want to collect preferences, we need to start a PrefInterface-running process
        if self.collect_prefs:
            self._start_pref_interface()
        # If we want to save preferences and/or train a reward model, we need to start a reward
        # predictor training process (which also handles creating a PrefDB in which preferences
        # are stored/saved)
        if self.train_reward or self.collect_prefs:
            self._start_reward_predictor_training()

    def _start_pref_interface(self):
        self.pref_interface_proc = mp.get_context(self.mp_context).Process(
            target=_run_pref_interface, daemon=True,
            args=(self.preference_interface,
                  self.seg_pipe,
                  self.pref_pipe,
                  self.kill_pref_interface_flag,
                  self.pref_interface_log_level))
        self.pref_interface_proc.start()

    def _start_reward_predictor_training(self):
        self.reward_training_proc = mp.get_context('spawn').Process(
            target=_train_reward_predictor, daemon=True,
            args=(self.reward_predictor_network,
                  self.train_reward,
                  self.pretrained_reward_predictor_dir,
                  self.obs_shape,
                  self.pref_pipe,
                  self.pref_db_size,
                  self.prefs_dir,
                  self.max_prefs,
                  self.ckpt_interval,
                  self.n_initial_prefs,
                  self.reward_training_steps,
                  self.reward_database_refresh_interval,
                  self.val_interval,
                  self.kill_reward_training_flag,
                  self.save_prefs_flag,
                  self.save_model_flag,
                  self.log_dir,
                  self.reward_predictor_log_level))
        self.reward_training_proc.start()

    def _update_episode_segment(self, obs, reward, done):
        """
        Takes the observation from the most recent environment step and adds it to the current
        segment. If the segment has reached the desired length, finalise it and send it to the
        PrefInterface via seg_pipe.

        :param obs: A (possibly stacked) observation from the underlying environment
        :param reward: Underlying environment reward (used for synthetic preferences)
        :param done: Whether the episode has terminated, in which case we pad the rest of the
            segment and then start a new one
        """
        if self.obs_transform_func is not None:
            obs = self.obs_transform_func(obs)
        self.episode_segment.append(np.copy(obs), np.copy(reward))
        if done:
            while len(self.episode_segment) < self.segment_length:
                self.episode_segment.append(np.copy(obs), 0)

        if len(self.episode_segment) == self.segment_length:
            self.segments_collected += 1
            self.episode_segment.finalise()
            try:
                self.seg_pipe.put(self.episode_segment, block=False)
            except queue.Full:
                # If the preference interface has a backlog of segments to deal with, don't stop
                # training the agents. Just drop the segment and keep going.
                pass
            self.episode_segment = Segment()

    def save_prefs(self):
        self.save_prefs_flag.value = 1

    def save_reward_predictor(self):
        self.save_model_flag.value = 1

    def stop_segment_collection(self):
        self.collecting_segments = False

    def start_segment_collection(self):
        self.collecting_segments = True
        self.episode_segment = Segment()

    def _load_reward_predictor(self, model_load_dir):
        if self.reward_predictor is None:
            self.logger.info(f"Loading reward predictor from {model_load_dir}; will use its model reward now")
            self.reward_predictor = RewardPredictorEnsemble(
                core_network=self.reward_predictor_network,
                log_dir=self.log_dir,
                batchnorm=False,
                dropout=0.0,
                lr=7e-4,
                obs_shape=self.obs_shape,
                logger=self.logger)
        self.reward_predictor_n_train = self.reward_training_steps.value
        self.reward_predictor.init_network(model_load_dir)

    def step(self, action):
        # Check whether we have only just hit the point of the model having trained for enough steps
        minimum_training_steps_reached = self.reward_training_steps.value >= self.n_initial_training_steps
        sufficiently_trained = self.reward_predictor is None and minimum_training_steps_reached
        # Check whether we have an existing pretrained model we've not yet loaded in
        pretrained_model = self.reward_predictor is None and self.pretrained_reward_predictor_dir is not None
        # Check whether we should replace our existing reward predictor with a new one because
        # we've done enough training steps since we last updated
        should_update_model = minimum_training_steps_reached and \
            (self.reward_training_steps.value - self.reward_predictor_n_train > self.reward_predictor_refresh_interval)

        # If any of these things are true, we load a model in
        if sufficiently_trained or pretrained_model or should_update_model:
            if sufficiently_trained:
                self.logger.info("Model is sufficiently trained, switching to it for reward")
                model_load_dir = self.reward_predictor_checkpoint_dir
            elif should_update_model:
                self.logger.info("Updating model used for env reward")
                model_load_dir = self.reward_predictor_checkpoint_dir
            else:
                model_load_dir = self.pretrained_reward_predictor_dir
                self.logger.info("Loading pretrained model for env reward")
            self._load_reward_predictor(model_load_dir)
            self.using_reward_from_predictor = True

        obs, reward, done, info = self.env.step(action)
        if self.collecting_segments:
            self._update_episode_segment(obs, reward, done)

        if self.reward_predictor is not None and not self.force_return_true_reward:
            # If self.force_return_true_reward is set, the environment will return the true
            # underlying reward (meant for evaluation purposes)
            predicted_reward = self.reward_predictor.reward(np.array([np.array(obs)]))[0]
            self.last_true_reward = reward
            return obs, predicted_reward, done, info
        else:
            return obs, reward, done, info

    def switch_to_true_reward(self):
        if not self.using_reward_from_predictor:
            raise Warning("Environment has no reward predictor loaded, and is thus returning true reward")
        elif self.force_return_true_reward:
            raise Warning("Environment already returning true reward, no change")
        else:
            self.using_reward_from_predictor = False
            self.force_return_true_reward = True

    def switch_to_predicted_reward(self):
        """
        Note: this only undoes a prior forcing of true reward if a reward model is already
        loaded; it can't cause a reward model to exist if one isn't present.
        """
        if not self.force_return_true_reward:
            raise Warning("Environment already returning predicted reward, no change")
        else:
            self.using_reward_from_predictor = True
            self.force_return_true_reward = False

    def _cleanup_processes(self):
        self.logger.debug("Sending kill flags to processes")
        self.kill_reward_training_flag.value = 1
        self.kill_pref_interface_flag.value = 1
        self.logger.debug("Joining processes that are running")
        if self.reward_training_proc is not None:
            self.reward_training_proc.join()
        if self.pref_interface_proc is not None:
            self.pref_interface_proc.join()
        self.logger.debug("Closing seg pipe")
        self.seg_pipe.close()
        self.seg_pipe.join_thread()
        self.logger.debug("Closing pref pipe")
        self.pref_pipe.close()
        self.pref_pipe.join_thread()

    def close(self):
        self.logger.debug("env.close() was called")
        self._cleanup_processes()
        self.env.close()
class Runner(object):
    def __init__(self, env, model, nsteps, nstack, gamma, gen_segments,
                 seg_pipe, reward_predictor, episode_vid_queue):
        self.env = env
        self.model = model
        nh, nw, nc = env.observation_space.shape
        nenv = env.num_envs
        # CHANGE: In A2C, this is defined with shape (n_env * n_steps, nh, nw, nc),
        # assuming that env.observation_space.shape = (nh, nw, nc)
        self.batch_ob_shape = (nenv * nsteps, nh, nw, nc * nstack)
        # CHANGE: In A2C, this is defined with shape (n_env, nh, nw, nc),
        # under the same observation space assumption
        self.obs = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8)
        # The first stack of 4 frames: the first 3 frames are zeros,
        # with the last frame coming from env.reset().
        print("Got to before reset")
        print("Shape of self.obs: {}".format(self.obs.shape))
        obs = env.reset()
        print("Finished env reset")
        self.update_obs(obs)
        print("Finished updating obs")
        self.gamma = gamma
        self.nsteps = nsteps
        self.states = model.initial_state
        self.dones = [False for _ in range(nenv)]

        self.gen_segments = gen_segments
        self.segment = Segment()
        self.seg_pipe = seg_pipe

        self.orig_reward = [0 for _ in range(nenv)]
        self.reward_predictor = reward_predictor

        self.episode_frames = []
        self.episode_vid_queue = episode_vid_queue
        print("Got to end of Runner creation")

    def update_obs(self, obs):
        # Do frame-stacking here instead of in the FrameStack wrapper to reduce IPC overhead
        # TODO take more general channel values
        self.obs = np.roll(self.obs, shift=-3, axis=3)
        self.obs[:, :, :, -3:] = obs[:, :, :, 0:3]

    def update_segment_buffer(self, mb_obs, mb_rewards, mb_dones):
        # Segments are only generated from the first worker.
        # Empirically, this seems to work fine.
        e0_obs = mb_obs[0]
        e0_rew = mb_rewards[0]
        e0_dones = mb_dones[0]
        assert_equal(e0_obs.shape[0], self.nsteps)
        # TODO make this general to the nstack parameter
        assert e0_obs.shape[-1] % 4 == 0
        assert_equal(e0_rew.shape[0], self.nsteps)
        assert_equal(e0_dones.shape[0], self.nsteps)
        # TODO generalize across num_channels
        converted_image = cv2.cvtColor(e0_obs[0][:, :, -3:], cv2.COLOR_RGB2BGR)
        cv2.imwrite("eo_obs_segment_buffer.png", converted_image)

        for step in range(self.nsteps):
            self.segment.append(np.copy(e0_obs[step]), np.copy(e0_rew[step]))
            if len(self.segment) == 40 or e0_dones[step]:
                while len(self.segment) < 40:
                    # Pad to the full 40 steps so that all segments in the batch
                    # have the same length.
                    # Note that the reward predictor needs the full frame
                    # stack, so we send all frames.
                    self.segment.append(e0_obs[step], 0)
                self.segment.finalise()
                try:
                    self.seg_pipe.put(self.segment, block=False)
                except queue.Full:
                    # If the preference interface has a backlog of segments
                    # to deal with, don't stop training the agents. Just drop
                    # the segment and keep on going.
                    pass
                self.segment = Segment()

    def update_episode_frame_buffer(self, mb_obs, mb_dones):
        e0_obs = mb_obs[0]
        e0_dones = mb_dones[0]
        for step in range(self.nsteps):
            # Here we only need to send the last frame (the most recent one)
            # from the 4-frame stack, because we're just showing output to the user.
            # TODO make general for num_channels
            self.episode_frames.append(e0_obs[step, :, :, -3])
            if e0_dones[step]:
                self.episode_vid_queue.put(self.episode_frames)
                self.episode_frames = []

    def run(self):
        nenvs = len(self.env.remotes)
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []
        mb_states = self.states
        # Run for nsteps steps in the environment
        for _ in range(self.nsteps):
            actions, values, states, _ = self.model.step(self.obs, self.states, self.dones)
            # actions here are of shape (1, 11)
            # IMPORTANT: we append the stacked version of obs here, and that's what gets
            # passed to update_segment_buffer
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(self.dones)
            # len({obs, rewards, dones}) == nenvs
            obs, rewards, dones, _ = self.env.step(actions)
            self.states = states
            self.dones = dones
            for n, done in enumerate(dones):
                if done:
                    self.obs[n] = self.obs[n] * 0
            # SubprocVecEnv automatically resets when done
            self.update_obs(obs)
            mb_rewards.append(rewards)
        mb_dones.append(self.dones)

        # batch of steps to batch of rollouts,
        # i.e. from (nsteps, nenvs) to (nenvs, nsteps)
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_masks = mb_dones[:, :-1]
        # The first entry was just the init state of 'dones' (all False),
        # before we'd actually run any steps, so drop it.
        mb_dones = mb_dones[:, 1:]

        # Log original rewards
        for env_n, (rs, dones) in enumerate(zip(mb_rewards, mb_dones)):
            assert_equal(rs.shape, (self.nsteps, ))
            assert_equal(dones.shape, (self.nsteps, ))
            for step_n in range(self.nsteps):
                self.orig_reward[env_n] += rs[step_n]
                if dones[step_n]:
                    easy_tf_log.tflog(
                        "orig_reward_{}".format(env_n),
                        self.orig_reward[env_n])
                    self.orig_reward[env_n] = 0

        if self.env.env_id == 'MovingDotNoFrameskip-v0':
            # For MovingDot, reward depends on both current observation and
            # current action, so encode the action in the observations.
            # (We only need to set this in the most recent frame,
            # because that's all that the reward predictor for MovingDot uses.)
            mb_obs[:, :, 0, 0, -1] = mb_actions[:, :]

        # Generate segments
        # (For MovingDot, this has to happen _after_ we've encoded the action
        # in the observations.)
        if self.gen_segments:
            self.update_segment_buffer(mb_obs, mb_rewards, mb_dones)

        # Replace rewards with those from the reward predictor
        # (Note that this also needs to be done _after_ we've encoded the action.)
        logging.debug("Original rewards:\n%s", mb_rewards)
        if self.reward_predictor:
            assert_equal(mb_obs.shape[0], nenvs)
            assert_equal(mb_obs.shape[1], self.nsteps)
            # TODO make general to stacking sizes other than 4
            assert mb_obs.shape[-1] % 4 == 0
            # TODO make general across num_channels
            h, w, c = mb_obs.shape[-3:]
            # TODO figure out what this reshape is doing here and whether it's
            # necessary pre-reward-predictor
            mb_obs_allenvs = mb_obs.reshape(nenvs * self.nsteps, h, w, c)

            rewards_allenvs = self.reward_predictor.reward(mb_obs_allenvs)
            assert_equal(rewards_allenvs.shape, (nenvs * self.nsteps, ))
            mb_rewards = rewards_allenvs.reshape(nenvs, self.nsteps)
            assert_equal(mb_rewards.shape, (nenvs, self.nsteps))

            logging.debug("Predicted rewards:\n%s", mb_rewards)

        # Save frames for episode rendering
        if self.episode_vid_queue is not None:
            self.update_episode_frame_buffer(mb_obs, mb_dones)

        # Discount rewards
        mb_obs = mb_obs.reshape(self.batch_ob_shape)
        last_values = self.model.value(self.obs, self.states, self.dones).tolist()
        # discount/bootstrap off value fn
        for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                # Make sure that the first iteration of the loop inside
                # discount_with_dones picks up 'value' as the initial value of r
                rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.gamma)
            mb_rewards[n] = rewards

        # Well, there's the culprit
        def flatten_correctly(arr):
            assert arr.shape[0] == 1
            new_shape = arr.shape[1:]
            return arr.reshape(new_shape)

        mb_rewards = flatten_correctly(mb_rewards)
        mb_actions = flatten_correctly(mb_actions)
        mb_values = flatten_correctly(mb_values)
        mb_masks = flatten_correctly(mb_masks)

        return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values