Example #1
 def update_segment_buffer(self, mb_obs, mb_rewards, mb_dones):
     # Segments are only generated from the first worker.
     # Empirically, this seems to work fine.
     e0_obs = mb_obs[0]
     e0_rew = mb_rewards[0]
     e0_dones = mb_dones[0]
     assert_equal(e0_obs.shape[0], self.nsteps)
     # TODO make this general to nstack parameter
     assert(e0_obs.shape[-1] % 4 == 0)
     assert_equal(e0_rew.shape[0], self.nsteps)
     assert_equal(e0_dones.shape[0], self.nsteps)
     # TODO generalize across num_channels
     converted_image = cv2.cvtColor(e0_obs[0][:, :, -3:], cv2.COLOR_RGB2BGR)
     cv2.imwrite("eo_obs_segment_buffer.png", converted_image)
     for step in range(self.nsteps):
         self.segment.append(np.copy(e0_obs[step]), np.copy(e0_rew[step]))
         if len(self.segment) == 40 or e0_dones[step]:
             while len(self.segment) < 40:
                 # Pad to 40 steps long so that all segments in the batch
                 # have the same length.
                 # Note that the reward predictor needs the full frame
                 # stack, so we send all frames.
                 self.segment.append(e0_obs[step], 0)
             self.segment.finalise()
             try:
                 self.seg_pipe.put(self.segment, block=False)
             except queue.Full:
                 # If the preference interface has a backlog of segments
                 # to deal with, don't stop training the agents. Just drop
                 # the segment and keep on going.
                 pass
             self.segment = Segment()
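
The snippet above (and the others on this page) assumes a small Segment container with append(frame, reward), len(), and finalise(seg_id=None), plus a multiprocessing queue as seg_pipe. A minimal sketch of that assumed interface (a stand-in for illustration, not the library's actual implementation):

import numpy as np


class Segment:
    # Minimal stand-in for the Segment container used in these examples.
    # It only implements what the snippets rely on: append(frame, reward),
    # len(), and finalise(seg_id=None).

    def __init__(self):
        self.frames = []
        self.rewards = []
        self.id = None

    def append(self, frame, reward):
        self.frames.append(frame)
        self.rewards.append(reward)

    def finalise(self, seg_id=None):
        # Freeze the collected frames/rewards into arrays and tag the segment.
        self.frames = np.array(self.frames)
        self.rewards = np.array(self.rewards, dtype=np.float32)
        self.id = seg_id if seg_id is not None else hash(self.frames.tobytes())

    def __len__(self):
        return len(self.frames)

With this stand-in, Example #4's send_segments and the buffer-update methods run as written; the real implementation may store extra metadata, but this matches how the snippets call it.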
Example #2
    def _update_episode_segment(self, obs, reward, done):
        """
        Takes the observation from the most recent environment step and adds it to the existing segment. If the segment
        has reached the desired length, finalise it and send it to the PrefInterface via seg_pipe.

        :param obs: A (possibly stacked) observation from the underlying environment
        :param reward: Underlying environment reward (used for synthetic preferences)
        :param done: Whether the episode has terminated, in which case we should pad the rest of the segment and
                    then start a new one
        :return:
        """
        if self.obs_transform_func is not None:
            obs = self.obs_transform_func(obs)
        self.episode_segment.append(np.copy(obs), np.copy(reward))
        if done:
            while len(self.episode_segment) < self.segment_length:
                self.episode_segment.append(np.copy(obs), 0)

        if len(self.episode_segment) == self.segment_length:
            self.segments_collected += 1
            self.episode_segment.finalise()
            try:
                self.seg_pipe.put(self.episode_segment, block=False)
            except queue.Full:
                # If the preference interface has a backlog of segments
                # to deal with, don't stop training the agents. Just drop
                # the segment and keep on going.
                pass
            self.episode_segment = Segment()
Example #3
    def __init__(self,
                 env,
                 model,
                 nsteps,
                 nstack,
                 gamma,
                 gen_segments,
                 seg_pipe,
                 reward_predictor,
                 episode_vid_queue):
        self.env = env
        self.model = model
        nh, nw, nc = env.observation_space.shape
        nenv = env.num_envs

        # CHANGE: In A2C, this is defined as being of shape
        # (n_env*n_steps, nh, nw, nc)
        # Assuming that env.observation_space.shape = (nh, nw, nc)
        self.batch_ob_shape = (nenv * nsteps, nh, nw, nc * nstack)

        # CHANGE: In A2C, this is defined as being of shape
        # (n_env, nh, nw, nc), under the same observation space assumption
        self.obs = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8)
        # The first stack of 4 frames: the first 3 frames are zeros,
        # with the last frame coming from env.reset().
        print("Got to before reset")
        print("Shape of self.obs: {}".format(self.obs.shape))
        obs = env.reset()
        print("Finished env reset")
        self.update_obs(obs)
        print("Finished updating obs")
        self.gamma = gamma
        self.nsteps = nsteps
        self.states = model.initial_state
        self.dones = [False for _ in range(nenv)]

        self.gen_segments = gen_segments
        self.segment = Segment()
        self.seg_pipe = seg_pipe

        self.orig_reward = [0 for _ in range(nenv)]
        self.reward_predictor = reward_predictor

        self.episode_frames = []
        self.episode_vid_queue = episode_vid_queue
        print("Got to end of Runner creation")
Example #4
def send_segments(n_segments, seg_pipe):
    frame_stack = np.zeros((84, 84, 4))
    for i in range(n_segments):
        segment = Segment()
        for _ in range(25):
            segment.append(frame=frame_stack, reward=0)
        segment.finalise(seg_id=i)
        seg_pipe.put(segment)
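
send_segments is the producer side of seg_pipe; the consumer side needs a matching non-blocking receive loop. A hedged sketch of what that loop might look like (recv_segments and its max_segs buffer are assumptions for illustration, not taken from the examples):

import queue


def recv_segments(seg_pipe, segments, max_segs=25):
    # Drain whatever segments are currently available on seg_pipe into a
    # local buffer, keeping only the most recent max_segs of them.
    while True:
        try:
            segment = seg_pipe.get(block=False)
        except queue.Empty:
            break
        segments.append(segment)
        if len(segments) > max_segs:
            segments.pop(0)  # drop the oldest segment
    return segments

Because both put (with block=False) and get here are non-blocking, neither the agent nor the preference interface stalls waiting on the other; overflow segments are simply dropped.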
Example #5
 def start_segment_collection(self):
     self.collecting_segments = True
     self.episode_segment = Segment()
Example #6
    def __init__(self,
                 env: Env,
                 reward_predictor_network: Callable = net_cnn,
                 train_reward: bool = True,
                 collect_prefs: bool = True,
                 segment_length: int = 40,
                 mp_context: str = 'spawn',
                 prefs_dir: str = None,
                 log_dir: str = "drlhp_logs/",
                 max_prefs_in_db: int = 10000,
                 obs_transform_func: Callable = None,
                 n_initial_training_steps: int = 50,
                 n_initial_prefs: int = 40,
                 pretrained_reward_predictor_dir: str = None,
                 reward_predictor_ckpt_interval: int = 10,
                 reward_predictor_refresh_interval: int = 10,
                 validation_interval: int = 10,
                 reward_database_refresh_interval: int = 1,
                 synthetic_prefs: bool = True,
                 max_pref_interface_segs: int = 25,
                 zoom_ratio: int = 4,
                 channels: int = 3,
                 env_wrapper_log_level: int = logging.INFO,
                 reward_predictor_log_level: int = logging.INFO,
                 pref_interface_log_level: int = logging.INFO
                 ):
        """
        A Wrapper that collects segments from the observations returned through its internal env's .step() function,
        and sends them to a PrefInterface that queries either humans or a synthetic reward oracle for preferences.

        It also manages creating and training a reward prediction network, using preferences stored in a PrefDB as
        training examples. When a minimum number of training steps has been reached, it loads the trained reward
        predictor network and starts using that as the returned reward, rather than the underlying environment reward.

        :param env: Underlying environment
        :param reward_predictor_network: Callable mapping between input obs and reward scalar
        :param train_reward: A boolean specifying whether or not the env should train a reward predictor
        :param collect_prefs: A boolean specifying whether or not the env should collect preferences in a PrefDB
        :param segment_length: How many observations long a segment should be before it's sent to the PrefInterface
        :param mp_context: A string specifying the multiprocessing context we want to use for this env's processes
        :param prefs_dir: A string path specifying where an existing set of PrefDBs is stored, if any exist
        :param log_dir: A string path specifying where logs and artifacts from this run should be saved
        :param max_prefs_in_db: The maximum number of preferences to store across both train and validation PrefDBs
        :param obs_transform_func: An optional function that transforms the observation returned by our
                                    internal environment into the observation that should be concatenated to form our
                                    segments (for example, if the underlying environment has a Dict observation space,
                                    your transform func could return obs['pov'])
        :param n_initial_training_steps: How many training steps should be performed before we switch to using a
                                        trained reward model as our returned environment reward
        :param n_initial_prefs: How many preferences to collect before starting to train our reward predictor


        :param pretrained_reward_predictor_dir: A string path specifying where a pretrained reward predictor
                                                is saved, if one exists

        :param reward_predictor_refresh_interval: Interval of reward predictor training steps on which to update the
                                                  reward predictor used by the env to calculate reward
        :param validation_interval: Interval of reward predictor training steps on which to perform validation
        :param reward_database_refresh_interval: Interval of reward predictor training steps on which to refresh the
                                                 PrefDBs used for training/validation

        :param reward_predictor_ckpt_interval: The interval of reward training steps on which we should automatically
                                               checkpoint the reward prediction model

        :param synthetic_prefs: If True, we use the reward function of the environment to calculate prefs; if False,
                                we query for human preferences using a GUI interface


        :param max_pref_interface_segs: The maximum number of segments that will be stored and paired with one another by
                                        the preference interface
        :param zoom_ratio: How much images should be zoomed when they're displayed to humans in the GUI (ignored if using
                            synthetic preferences)
        :param channels: The number of channels the images you'll show to humans will have. (Can't be inferred from
                         observation space shape because common usage involves a FrameStack wrapper, which will stack
                         frames along the channel dimension)
        :param env_wrapper_log_level: The log level of the logger corresponding to the wrapper as a whole
        :param reward_predictor_log_level: The log level of the logger corresponding to the reward predictor training function
        :param pref_interface_log_level: The log level of the logger used by the preference interface
        """

        # Recommend using 'spawn' for non-synthetic (human) preferences and 'fork' for synthetic preferences
        super(HumanPreferencesEnvWrapper, self).__init__(env)
        self.logger = logging.getLogger("HumanPreferencesEnvWrapper")
        self.logger.setLevel(env_wrapper_log_level)
        self.reward_predictor_log_level = reward_predictor_log_level
        self.pref_interface_log_level = pref_interface_log_level

        self.obs_shape = env.observation_space.shape

        self.preference_interface = PrefInterface(synthetic_prefs=synthetic_prefs,
                                                  max_segs=max_pref_interface_segs,
                                                  log_dir=log_dir,
                                                  channels=channels,
                                                  zoom=zoom_ratio)

        # Save a bunch of init parameters as wrapper properties
        self.synthetic_prefs = synthetic_prefs
        self.mp_context = mp_context
        self.train_reward = train_reward
        self.collect_prefs = collect_prefs
        self.segment_length = segment_length
        self.reward_predictor_network = reward_predictor_network
        self.pretrained_reward_predictor_dir = pretrained_reward_predictor_dir
        self.obs_transform_func = obs_transform_func
        self.prefs_dir = prefs_dir
        self.max_prefs = max_prefs_in_db
        self.n_initial_prefs = n_initial_prefs
        self.n_initial_training_steps = n_initial_training_steps
        self.log_dir = log_dir
        self.ckpt_interval = reward_predictor_ckpt_interval
        self.reward_predictor_refresh_interval = reward_predictor_refresh_interval
        self.val_interval = validation_interval
        self.reward_database_refresh_interval = reward_database_refresh_interval


        # Setting counter and status variables to initial values
        self.segments_collected = 0
        self.reward_predictor_n_train = 0
        self.using_reward_from_predictor = False
        self.force_return_true_reward = False
        self.collecting_segments = True
        self.last_true_reward = None

        # Create empty observation stack and new segment
        self.recent_obs_stack = []
        self.episode_segment = Segment()
        self.reward_predictor_checkpoint_dir = os.path.join(log_dir, 'reward_predictor_checkpoints')

        # Create Queues and Values to handle multiprocessing communication
        # TODO figure out how to make the mechanics of this work with larger Queues, so we don't drop segments on the
        # TODO ground due to timing issues
        self.seg_pipe = mp.get_context(self.mp_context).Queue(maxsize=5)
        self.pref_pipe = mp.get_context(self.mp_context).Queue(maxsize=1)
        self.pref_db_size = mp.get_context(self.mp_context).Value('i', 0)
        self.kill_pref_interface_flag = mp.get_context(self.mp_context).Value('i', 0)
        self.kill_reward_training_flag = mp.get_context(self.mp_context).Value('i', 0)
        self.save_model_flag = mp.get_context(self.mp_context).Value('i', 0)
        self.save_prefs_flag = mp.get_context(self.mp_context).Value('i', 0)
        self.reward_training_steps = mp.get_context(self.mp_context).Value('i', 0)

        # Create placeholder parameters for things that we'll initialize later
        self.pref_interface_proc = None
        self.reward_training_proc = None
        self.pref_buffer = None
        self.reward_predictor = None

        # If we want to collect preferences, we need to start a PrefInterface-running process
        if self.collect_prefs:
            self._start_pref_interface()
        # If we want to save preferences and/or train a reward model, we need to start a reward predictor training
        # process (which also handles creating a PrefDB in which preferences are stored/saved)
        if self.train_reward or self.collect_prefs:
            self._start_reward_predictor_training()
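
A hedged usage sketch of this constructor with synthetic preferences, assuming the older 4-tuple Gym step API that the snippet itself uses; the environment id and keyword values below are illustrative, not taken from the examples:

import gym

# Wrap an image-observation environment. With synthetic_prefs=True no GUI
# is needed: preferences come from the underlying environment reward.
env = HumanPreferencesEnvWrapper(
    gym.make("PongNoFrameskip-v4"),
    segment_length=40,
    synthetic_prefs=True,
    mp_context='fork',        # 'fork' is suggested above for synthetic prefs
    log_dir="drlhp_logs/")

obs = env.reset()
for _ in range(1000):
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        obs = env.reset()
env.close()

Once reward_training_steps passes n_initial_training_steps, step() switches to returning the predicted reward instead of the environment reward (see the step() method in Example #7).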
Example #7
class HumanPreferencesEnvWrapper(Wrapper):
    def __init__(self,
                 env: Env,
                 reward_predictor_network: Callable = net_cnn,
                 train_reward: bool = True,
                 collect_prefs: bool = True,
                 segment_length: int = 40,
                 mp_context: str = 'spawn',
                 prefs_dir: str = None,
                 log_dir: str = "drlhp_logs/",
                 max_prefs_in_db: int = 10000,
                 obs_transform_func: Callable = None,
                 n_initial_training_steps: int = 50,
                 n_initial_prefs: int = 40,
                 pretrained_reward_predictor_dir: str = None,
                 reward_predictor_ckpt_interval: int = 10,
                 reward_predictor_refresh_interval: int = 10,
                 validation_interval: int = 10,
                 reward_database_refresh_interval: int = 1,
                 synthetic_prefs: bool = True,
                 max_pref_interface_segs: int = 25,
                 zoom_ratio: int = 4,
                 channels: int = 3,
                 env_wrapper_log_level: int = logging.INFO,
                 reward_predictor_log_level: int = logging.INFO,
                 pref_interface_log_level: int = logging.INFO
                 ):
        """
        A Wrapper that collects segments from the observations returned through its internal env's .step() function,
        and sends them to a PrefInterface that queries either humans or a synthetic reward oracle for preferences.

        It also manages creating and training a reward prediction network, using preferences stored in a PrefDB as
        training examples. When a minimum number of training steps has been reached, it loads the trained reward
        predictor network and starts using that as the returned reward, rather than the underlying environment reward.

        :param env: Underlying environment
        :param reward_predictor_network: Callable mapping between input obs and reward scalar
        :param train_reward: A boolean specifying whether or not the env should train a reward predictor
        :param collect_prefs: A boolean specifying whether or not the env should collect preferences in a PrefDB
        :param segment_length: How many observations long a segment should be before it's sent to the PrefInterface
        :param mp_context: A string specifying the multiprocessing context we want to use for this env's processes
        :param prefs_dir: A string path specifying where an existing set of PrefDBs is stored, if any exist
        :param log_dir: A string path specifying where logs and artifacts from this run should be saved
        :param max_prefs_in_db: The maximum number of preferences to store across both train and validation PrefDBs
        :param obs_transform_func: An optional function that transforms the observation returned by our
                                    internal environment into the observation that should be concatenated to form our
                                    segments (for example, if the underlying environment has a Dict observation space,
                                    your transform func could return obs['pov'])
        :param n_initial_training_steps: How many training steps should be performed before we switch to using a
                                        trained reward model as our returned environment reward
        :param n_initial_prefs: How many preferences to collect before starting to train our reward predictor


        :param pretrained_reward_predictor_dir: A string path specifying where a pretrained reward predictor
                                                is saved, if one exists

        :param reward_predictor_refresh_interval: Interval of reward predictor training steps on which to update the
                                                  reward predictor used by the env to calculate reward
        :param validation_interval: Interval of reward predictor training steps on which to perform validation
        :param reward_database_refresh_interval: Interval of reward predictor training steps on which to refresh the
                                                 PrefDBs used for training/validation

        :param reward_predictor_ckpt_interval: The interval of reward training steps on which we should automatically
                                               checkpoint the reward prediction model

        :param synthetic_prefs: If True, we use the reward function of the environment to calculate prefs; if False,
                                we query for human preferences using a GUI interface


        :param max_pref_interface_segs: The maximum number of segments that will be stored and paired with one another by
                                        the preference interface
        :param zoom_ratio: How much images should be zoomed when they're displayed to humans in the GUI (ignored if using
                            synthetic preferences)
        :param channels: The number of channels the images you'll show to humans will have. (Can't be inferred from
                         observation space shape because common usage involves a FrameStack wrapper, which will stack
                         frames along the channel dimension)
        :param env_wrapper_log_level: The log level of the logger corresponding to the wrapper as a whole
        :param reward_predictor_log_level: The log level of the logger corresponding to the reward predictor training function
        :param pref_interface_log_level: The log level of the logger used by the preference interface
        """

        # Recommend using 'spawn' for non-synthetic (human) preferences and 'fork' for synthetic preferences
        super(HumanPreferencesEnvWrapper, self).__init__(env)
        self.logger = logging.getLogger("HumanPreferencesEnvWrapper")
        self.logger.setLevel(env_wrapper_log_level)
        self.reward_predictor_log_level = reward_predictor_log_level
        self.pref_interface_log_level = pref_interface_log_level

        self.obs_shape = env.observation_space.shape

        self.preference_interface = PrefInterface(synthetic_prefs=synthetic_prefs,
                                                  max_segs=max_pref_interface_segs,
                                                  log_dir=log_dir,
                                                  channels=channels,
                                                  zoom=zoom_ratio)

        # Save a bunch of init parameters as wrapper properties
        self.synthetic_prefs = synthetic_prefs
        self.mp_context = mp_context
        self.train_reward = train_reward
        self.collect_prefs = collect_prefs
        self.segment_length = segment_length
        self.reward_predictor_network = reward_predictor_network
        self.pretrained_reward_predictor_dir = pretrained_reward_predictor_dir
        self.obs_transform_func = obs_transform_func
        self.prefs_dir = prefs_dir
        self.max_prefs = max_prefs_in_db
        self.n_initial_prefs = n_initial_prefs
        self.n_initial_training_steps = n_initial_training_steps
        self.log_dir = log_dir
        self.ckpt_interval = reward_predictor_ckpt_interval
        self.reward_predictor_refresh_interval = reward_predictor_refresh_interval
        self.val_interval = validation_interval
        self.reward_database_refresh_interval = reward_database_refresh_interval


        # Setting counter and status variables to initial values
        self.segments_collected = 0
        self.reward_predictor_n_train = 0
        self.using_reward_from_predictor = False
        self.force_return_true_reward = False
        self.collecting_segments = True
        self.last_true_reward = None

        # Create empty observation stack and new segment
        self.recent_obs_stack = []
        self.episode_segment = Segment()
        self.reward_predictor_checkpoint_dir = os.path.join(log_dir, 'reward_predictor_checkpoints')

        # Create Queues and Values to handle multiprocessing communication
        # TODO figure out how to make the mechanics of this work with larger Queues, so we don't drop segments on the
        # TODO ground due to timing issues
        self.seg_pipe = mp.get_context(self.mp_context).Queue(maxsize=5)
        self.pref_pipe = mp.get_context(self.mp_context).Queue(maxsize=1)
        self.pref_db_size = mp.get_context(self.mp_context).Value('i', 0)
        self.kill_pref_interface_flag = mp.get_context(self.mp_context).Value('i', 0)
        self.kill_reward_training_flag = mp.get_context(self.mp_context).Value('i', 0)
        self.save_model_flag = mp.get_context(self.mp_context).Value('i', 0)
        self.save_prefs_flag = mp.get_context(self.mp_context).Value('i', 0)
        self.reward_training_steps = mp.get_context(self.mp_context).Value('i', 0)

        # Create placeholder parameters for things that we'll initialize later
        self.pref_interface_proc = None
        self.reward_training_proc = None
        self.pref_buffer = None
        self.reward_predictor = None

        # If we want to collect preferences, we need to start a PrefInterface-running process
        if self.collect_prefs:
            self._start_pref_interface()
        # If we want to save preferences and/or train a reward model, we need to start a reward predictor training
        # process (which also handles creating a PrefDB in which preferences are stored/saved)
        if self.train_reward or self.collect_prefs:
            self._start_reward_predictor_training()

    def _start_pref_interface(self):
        self.pref_interface_proc = mp.get_context(self.mp_context).Process(target=_run_pref_interface, daemon=True,
                                                                           args=(self.preference_interface,
                                                                                 self.seg_pipe,
                                                                                 self.pref_pipe,
                                                                                 self.kill_pref_interface_flag,
                                                                                 self.pref_interface_log_level))
        self.pref_interface_proc.start()

    def _start_reward_predictor_training(self):
        self.reward_training_proc = mp.get_context('spawn').Process(target=_train_reward_predictor, daemon=True,
                                                                    args=(self.reward_predictor_network,
                                                                         self.train_reward,
                                                                         self.pretrained_reward_predictor_dir,
                                                                         self.obs_shape,
                                                                         self.pref_pipe,
                                                                         self.pref_db_size,
                                                                         self.prefs_dir,
                                                                         self.max_prefs,
                                                                         self.ckpt_interval,
                                                                         self.n_initial_prefs,
                                                                         self.reward_training_steps,
                                                                         self.reward_database_refresh_interval,
                                                                         self.val_interval,
                                                                         self.kill_reward_training_flag,
                                                                         self.save_prefs_flag,
                                                                         self.save_model_flag,
                                                                         self.log_dir,
                                                                         self.reward_predictor_log_level))
        self.reward_training_proc.start()

    def _update_episode_segment(self, obs, reward, done):
        """
        Takes the observation from the most recent environment step and adds it to the existing segment. If the segment
        has reached the desired length, finalise it and send it to the PrefInterface via seg_pipe.

        :param obs: A (possibly stacked) observation from the underlying environment
        :param reward: Underlying environment reward (used for synthetic preferences)
        :param done: Whether the episode has terminated, in which case we should pad the rest of the segment and
                    then start a new one
        :return:
        """
        if self.obs_transform_func is not None:
            obs = self.obs_transform_func(obs)
        self.episode_segment.append(np.copy(obs), np.copy(reward))
        if done:
            while len(self.episode_segment) < self.segment_length:
                self.episode_segment.append(np.copy(obs), 0)

        if len(self.episode_segment) == self.segment_length:
            self.segments_collected += 1
            self.episode_segment.finalise()
            try:
                self.seg_pipe.put(self.episode_segment, block=False)
            except queue.Full:
                # If the preference interface has a backlog of segments
                # to deal with, don't stop training the agents. Just drop
                # the segment and keep on going.
                pass
            self.episode_segment = Segment()

    def save_prefs(self):
        self.save_prefs_flag.value = 1

    def save_reward_predictor(self):
        self.save_model_flag.value = 1

    def stop_segment_collection(self):
        self.collecting_segments = False

    def start_segment_collection(self):
        self.collecting_segments = True
        self.episode_segment = Segment()

    def _load_reward_predictor(self, model_load_dir):
        if self.reward_predictor is None:
            self.logger.info(f"Loading reward predictor from {model_load_dir}; will use its model reward now")
            self.reward_predictor = RewardPredictorEnsemble(
                core_network=self.reward_predictor_network,
                log_dir=self.log_dir,
                batchnorm=False,
                dropout=0.0,
                lr=7e-4,
                obs_shape=self.obs_shape,
                logger=self.logger)
        self.reward_predictor_n_train = self.reward_training_steps.value

        self.reward_predictor.init_network(model_load_dir)

    def step(self, action):
        # Check whether the reward model has only just reached the minimum number of training steps

        minimum_training_steps_reached = self.reward_training_steps.value >= self.n_initial_training_steps
        sufficiently_trained = self.reward_predictor is None and minimum_training_steps_reached

        # Check whether we have an existing pretrained model we've not yet loaded in
        pretrained_model = self.reward_predictor is None and self.pretrained_reward_predictor_dir is not None

        # Check whether we should update our existing reward predictor with a new one because we've done enough
        # training steps since we last updated
        should_update_model = minimum_training_steps_reached and (self.reward_training_steps.value - self.reward_predictor_n_train > self.reward_predictor_refresh_interval)

        # If any of these things are true, we load a model in
        if sufficiently_trained or pretrained_model or should_update_model:
            if sufficiently_trained:
                self.logger.info("Model is sufficiently trained, switching to it for reward")
                model_load_dir = self.reward_predictor_checkpoint_dir
            elif should_update_model:
                self.logger.info("Updating model used for env reward")
                model_load_dir = self.reward_predictor_checkpoint_dir
            else:
                model_load_dir = self.pretrained_reward_predictor_dir
                self.logger.info("Loading pretrained model for env reward")
            self._load_reward_predictor(model_load_dir)
            self.using_reward_from_predictor = True
        obs, reward, done, info = self.env.step(action)

        if self.collecting_segments:
            self._update_episode_segment(obs, reward, done)

        if self.reward_predictor is not None and not self.force_return_true_reward:
            # If we have self.force_return_true_reward set, the environment will return the true
            # underlying reward (meant for evaluation purposes)
            predicted_reward = self.reward_predictor.reward(np.array([np.array(obs)]))[0]
            self.last_true_reward = reward
            return obs, predicted_reward, done, info
        else:
            return obs, reward, done, info

    def switch_to_true_reward(self):
        if not self.using_reward_from_predictor:
            raise Warning("Environment has no reward predictor loaded, and is thus returning true reward")
        elif self.force_return_true_reward:
            raise Warning("Environment already returning true reward, no change")
        else:
            self.using_reward_from_predictor = False
            self.force_return_true_reward = True

    def switch_to_predicted_reward(self):
        """
        Note: this only undoes a prior forcing of true reward when a reward model is already loaded;
        it can't cause a reward model to exist if one isn't present.
        """
        if not self.force_return_true_reward:
            raise Warning("Environment already returning predicted reward, no change")
        else:
            self.using_reward_from_predictor = True
            self.force_return_true_reward = False


    def _cleanup_processes(self):
        self.logger.debug("Sending kill flags to processes")
        self.kill_reward_training_flag.value = 1
        self.kill_pref_interface_flag.value = 1

        self.logger.debug("Joining processes that are running")
        if self.reward_training_proc is not None:
            self.reward_training_proc.join()
        if self.pref_interface_proc is not None:
            self.pref_interface_proc.join()

        self.logger.debug("Closing seg pipe")
        self.seg_pipe.close()
        self.seg_pipe.join_thread()
        self.logger.debug("Closing pref pipe")
        self.pref_pipe.close()
        self.pref_pipe.join_thread()

    def close(self):
        self.logger.debug("env.close() was called")
        self._cleanup_processes()
        self.env.close()
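
The switch_to_true_reward / switch_to_predicted_reward pair exists so that evaluation can be run against the true environment reward and training can then resume on predicted reward. A hedged sketch of that pattern (evaluate_with_true_reward and policy are hypothetical names, not part of the wrapper):

def evaluate_with_true_reward(env, policy, n_episodes=5):
    # Temporarily force the wrapper to return the true environment reward,
    # run some evaluation episodes, then restore predicted reward.
    # Assumes a reward predictor has already been loaded; otherwise
    # switch_to_true_reward raises a Warning.
    env.switch_to_true_reward()
    returns = []
    try:
        for _ in range(n_episodes):
            obs, done, ep_return = env.reset(), False, 0.0
            while not done:
                obs, reward, done, info = env.step(policy(obs))
                ep_return += reward
            returns.append(ep_return)
    finally:
        env.switch_to_predicted_reward()
    return returns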
Example #8
class Runner(object):
    def __init__(self,
                 env,
                 model,
                 nsteps,
                 nstack,
                 gamma,
                 gen_segments,
                 seg_pipe,
                 reward_predictor,
                 episode_vid_queue):
        self.env = env
        self.model = model
        nh, nw, nc = env.observation_space.shape
        nenv = env.num_envs

        # CHANGE: In A2C, this is defined as being of shape
        # (n_env*n_steps, nh, nw, nc)
        # Assuming that env.observation_space.shape = (nh, nw, nc)
        self.batch_ob_shape = (nenv * nsteps, nh, nw, nc * nstack)

        # CHANGE: In A2C, this is defined as being of shape
        # (n_env, nh, nw, nc), under the same observation space assumption
        self.obs = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8)
        # The first stack of 4 frames: the first 3 frames are zeros,
        # with the last frame coming from env.reset().
        print("Got to before reset")
        print("Shape of self.obs: {}".format(self.obs.shape))
        obs = env.reset()
        print("Finished env reset")
        self.update_obs(obs)
        print("Finished updating obs")
        self.gamma = gamma
        self.nsteps = nsteps
        self.states = model.initial_state
        self.dones = [False for _ in range(nenv)]

        self.gen_segments = gen_segments
        self.segment = Segment()
        self.seg_pipe = seg_pipe

        self.orig_reward = [0 for _ in range(nenv)]
        self.reward_predictor = reward_predictor

        self.episode_frames = []
        self.episode_vid_queue = episode_vid_queue
        print("Got to end of Runner creation")

    def update_obs(self, obs):
        # Do frame-stacking here instead of the FrameStack wrapper to reduce
        # IPC overhead
        # TODO take more general channel values
        self.obs = np.roll(self.obs, shift=-3, axis=3)
        self.obs[:, :, :, -3:] = obs[:, :, :, 0:3]

    def update_segment_buffer(self, mb_obs, mb_rewards, mb_dones):
        # Segments are only generated from the first worker.
        # Empirically, this seems to work fine.
        e0_obs = mb_obs[0]
        e0_rew = mb_rewards[0]
        e0_dones = mb_dones[0]
        assert_equal(e0_obs.shape[0], self.nsteps)
        # TODO make this general to nstack parameter
        assert(e0_obs.shape[-1] % 4 == 0)
        assert_equal(e0_rew.shape[0], self.nsteps)
        assert_equal(e0_dones.shape[0], self.nsteps)
        # TODO generalize across num_channels
        converted_image = cv2.cvtColor(e0_obs[0][:, :, -3:], cv2.COLOR_RGB2BGR)
        cv2.imwrite("eo_obs_segment_buffer.png", converted_image)
        for step in range(self.nsteps):
            self.segment.append(np.copy(e0_obs[step]), np.copy(e0_rew[step]))
            if len(self.segment) == 40 or e0_dones[step]:
                while len(self.segment) < 40:
                    # Pad to 40 steps long so that all segments in the batch
                    # have the same length.
                    # Note that the reward predictor needs the full frame
                    # stack, so we send all frames.
                    self.segment.append(e0_obs[step], 0)
                self.segment.finalise()
                try:
                    self.seg_pipe.put(self.segment, block=False)
                except queue.Full:
                    # If the preference interface has a backlog of segments
                    # to deal with, don't stop training the agents. Just drop
                    # the segment and keep on going.
                    pass
                self.segment = Segment()

    def update_episode_frame_buffer(self, mb_obs, mb_dones):
        e0_obs = mb_obs[0]
        e0_dones = mb_dones[0]
        for step in range(self.nsteps):
            # Here we only need to send the last frame (the most recent one)
            # from the 4-frame stack, because we're just showing output to
            # the user.
            # TODO make general for num_channels
            self.episode_frames.append(e0_obs[step, :, :, -3])
            if e0_dones[step]:
                self.episode_vid_queue.put(self.episode_frames)
                self.episode_frames = []

    def run(self):
        nenvs = len(self.env.remotes)
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = \
            [], [], [], [], []
        mb_states = self.states

        # Run for nsteps steps in the environment
        for _ in range(self.nsteps):
            actions, values, states, _ = self.model.step(self.obs, self.states,
                                                      self.dones)
            # actions here are of shape (1, 11)

            # IMPORTANT: Here we are adding multiple copies of the
            # stacked version of obs, and that's what we pass to the update_segment_buffer
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(self.dones)
            # len({obs, rewards, dones}) == nenvs
            obs, rewards, dones, _ = self.env.step(actions)
            self.states = states
            self.dones = dones
            for n, done in enumerate(dones):
                if done:
                    self.obs[n] = self.obs[n] * 0
            # SubprocVecEnv automatically resets when done
            self.update_obs(obs)
            mb_rewards.append(rewards)
        mb_dones.append(self.dones)
        # batch of steps to batch of rollouts
        # i.e. from nsteps, nenvs to nenvs, nsteps
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)  # np.bool is removed in recent NumPy; plain bool behaves the same
        mb_masks = mb_dones[:, :-1]
        # The first entry was just the init state of 'dones' (all False),
        # before we'd actually run any steps, so drop it.
        mb_dones = mb_dones[:, 1:]

        # Log original rewards
        for env_n, (rs, dones) in enumerate(zip(mb_rewards, mb_dones)):
            assert_equal(rs.shape, (self.nsteps, ))
            assert_equal(dones.shape, (self.nsteps, ))
            for step_n in range(self.nsteps):
                self.orig_reward[env_n] += rs[step_n]
                if dones[step_n]:
                    easy_tf_log.tflog(
                        "orig_reward_{}".format(env_n),
                        self.orig_reward[env_n])
                    self.orig_reward[env_n] = 0

        if self.env.env_id == 'MovingDotNoFrameskip-v0':
            # For MovingDot, reward depends on both current observation and
            # current action, so encode action in the observations.
            # (We only need to set this in the most recent frame,
            # because that's all that the reward predictor for MovingDot
            # uses.)
            mb_obs[:, :, 0, 0, -1] = mb_actions[:, :]

        # Generate segments
        # (For MovingDot, this has to happen _after_ we've encoded the action
        # in the observations.)
        if self.gen_segments:
            self.update_segment_buffer(mb_obs, mb_rewards, mb_dones)

        # Replace rewards with those from reward predictor
        # (Note that this also needs to be done _after_ we've encoded the
        # action.)
        logging.debug("Original rewards:\n%s", mb_rewards)
        if self.reward_predictor:
            assert_equal(mb_obs.shape[0], nenvs)
            assert_equal(mb_obs.shape[1], self.nsteps)
            # TODO make general to stacking sizes other than 4
            assert(mb_obs.shape[-1] % 4 == 0)
            # TODO make general across num_channels
            h, w, c = mb_obs.shape[-3:]

            # TODO figure out what this reshape is doing here and whether it's necessary pre-reward-predictor
            mb_obs_allenvs = mb_obs.reshape(nenvs * self.nsteps, h, w, c)

            rewards_allenvs = self.reward_predictor.reward(mb_obs_allenvs)
            assert_equal(rewards_allenvs.shape, (nenvs * self.nsteps, ))
            mb_rewards = rewards_allenvs.reshape(nenvs, self.nsteps)
            assert_equal(mb_rewards.shape, (nenvs, self.nsteps))

            logging.debug("Predicted rewards:\n%s", mb_rewards)

        # Save frames for episode rendering
        if self.episode_vid_queue is not None:
            self.update_episode_frame_buffer(mb_obs, mb_dones)

        # Discount rewards
        mb_obs = mb_obs.reshape(self.batch_ob_shape)
        last_values = self.model.value(self.obs, self.states,
                                       self.dones).tolist()
        # discount/bootstrap off value fn
        for n, (rewards, dones, value) in enumerate(
                zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                # Make sure that the first iteration of the loop inside
                # discount_with_dones picks up 'value' as the initial
                # value of r
                rewards = discount_with_dones(rewards + [value],
                                              dones + [0],
                                              self.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.gamma)
            mb_rewards[n] = rewards
        # Well, there's the culprit

        def flatten_correctly(arr):
            assert arr.shape[0] == 1
            new_shape = arr.shape[1:]
            return arr.reshape(new_shape)

        mb_rewards = flatten_correctly(mb_rewards)
        mb_actions = flatten_correctly(mb_actions)
        mb_values = flatten_correctly(mb_values)
        mb_masks = flatten_correctly(mb_masks)

        return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
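
run() relies on discount_with_dones from the A2C utilities to bootstrap returns off the value function; the baselines-style helper it expects looks essentially like this (reproduced here as a reference sketch):

def discount_with_dones(rewards, dones, gamma):
    # Discounted returns that reset at episode boundaries. When the rollout
    # does not end in a terminal state, run() appends the bootstrap value to
    # rewards and a 0 to dones, then drops the final entry of the result.
    discounted = []
    r = 0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        r = reward + gamma * r * (1. - done)
        discounted.append(r)
    return discounted[::-1]

For example, discount_with_dones([1, 1, 1], [0, 0, 1], 0.9) returns [2.71, 1.9, 1.0].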