def _run_pref_interface(pref_interface: PrefInterface,
                        seg_pipe: mp.Queue,
                        pref_pipe: mp.Queue,
                        kill_processes: mp.Value,
                        log_level: int = logging.INFO):
    """
    Basically a large lambda function for calling pref_interface.run(); meant to be used as the
    target of a multiprocessing Process.

    :param pref_interface: The PrefInterface object you want to run
    :param seg_pipe: A multiprocessing Queue in which the env will add new segments for the
                     PrefInterface to pair and request preferences for
    :param pref_pipe: A multiprocessing Queue for the PrefInterface to add preferences once collected,
                      which will make them accessible to the PrefDB in which they are stored and used
                      for reward predictor training
    :param kill_processes: A multiprocessing Value that will be set to 1 if we want to terminate
                           running processes (specifically, it will trigger pref_interface.run() to
                           return so we can easily join the process)
    :param log_level: The log level passed through to pref_interface.run()
    """
    pref_interface.run(seg_pipe=seg_pipe,
                       pref_pipe=pref_pipe,
                       kill_processes=kill_processes,
                       log_level=log_level)
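# Illustrative sketch (hypothetical helper, not part of the original module): shows one way
# _run_pref_interface can be used as the target of a multiprocessing Process. The 'spawn'
# context and the queue maxsize values below are assumptions for the example, not requirements
# of the helper itself.
def _example_spawn_pref_interface_process(pref_interface: PrefInterface):
    ctx = mp.get_context('spawn')
    seg_pipe = ctx.Queue(maxsize=5)     # env -> PrefInterface: segments to pair up
    pref_pipe = ctx.Queue(maxsize=1)    # PrefInterface -> PrefDB: collected preferences
    kill_flag = ctx.Value('i', 0)       # set to 1 to make pref_interface.run() return
    proc = ctx.Process(target=_run_pref_interface,
                       args=(pref_interface, seg_pipe, pref_pipe, kill_flag, logging.INFO),
                       daemon=True)
    proc.start()
    # Caller is responsible for eventually setting kill_flag.value = 1 and calling proc.join()
    return seg_pipe, pref_pipe, kill_flag, proc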
def test_recv_segments(self):
    """
    Check that segments are stored correctly in the circular buffer.
    """
    pi = PrefInterface(synthetic_prefs=True, max_segs=5, log_dir='/tmp')
    pipe = Queue()
    for i in range(5):
        pipe.put(i)
    pi.recv_segments(pipe)
    np.testing.assert_array_equal(pi.segments, [0, 1, 2, 3, 4])
    for i in range(5, 8):
        pipe.put(i)
    pi.recv_segments(pipe)
    np.testing.assert_array_equal(pi.segments, [5, 6, 7, 3, 4])
    for i in range(8, 11):
        pipe.put(i)
    pi.recv_segments(pipe)
    np.testing.assert_array_equal(pi.segments, [10, 6, 7, 8, 9])
def start_pref_interface(seg_pipe, pref_pipe, max_segs, synthetic_prefs,
                         log_dir, zoom, channels):
    def f():
        # The preference interface needs to get input from stdin. stdin is
        # automatically closed at the beginning of child processes in Python,
        # so this is a bit of a hack, but it seems to be fine.
        sys.stdin = os.fdopen(0)
        pi.run(seg_pipe=seg_pipe, pref_pipe=pref_pipe)

    # Needs to be created in the main process because the constructor does GUI setup work
    prefs_log_dir = osp.join(log_dir, 'pref_interface')
    pi = PrefInterface(synthetic_prefs=synthetic_prefs,
                       max_segs=max_segs,
                       log_dir=prefs_log_dir,
                       channels=channels,
                       zoom=zoom)
    print("Preference interface has been created")

    proc = Process(target=f, daemon=True)
    proc.start()

    return pi, proc
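# Illustrative usage sketch (hypothetical helper, not part of the original module): wiring
# start_pref_interface up with the queues it expects. The queue maxsize values and the
# zoom/channels arguments here are assumptions chosen for the example.
def _example_start_pref_interface(log_dir='/tmp/drlhp_example'):
    from multiprocessing import Queue  # local import so the example is self-contained
    seg_pipe = Queue(maxsize=5)    # env -> PrefInterface: segments to pair and query about
    pref_pipe = Queue(maxsize=1)   # PrefInterface -> PrefDB: collected preferences
    pi, proc = start_pref_interface(seg_pipe=seg_pipe,
                                    pref_pipe=pref_pipe,
                                    max_segs=25,
                                    synthetic_prefs=True,
                                    log_dir=log_dir,
                                    zoom=4,
                                    channels=3)
    # proc is a daemon Process running pi.run(); it exits with the main process
    return pi, proc, seg_pipe, pref_pipe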
def __init__(self,
             env: Env,
             reward_predictor_network: Callable = net_cnn,
             train_reward: bool = True,
             collect_prefs: bool = True,
             segment_length: int = 40,
             mp_context: str = 'spawn',
             prefs_dir: str = None,
             log_dir: str = "drlhp_logs/",
             max_prefs_in_db: int = 10000,
             obs_transform_func: Callable = None,
             n_initial_training_steps: int = 50,
             n_initial_prefs: int = 40,
             pretrained_reward_predictor_dir: str = None,
             reward_predictor_ckpt_interval: int = 10,
             reward_predictor_refresh_interval: int = 10,
             validation_interval: int = 10,
             reward_database_refresh_interval: int = 1,
             synthetic_prefs: bool = True,
             max_pref_interface_segs: int = 25,
             zoom_ratio: int = 4,
             channels: int = 3,
             env_wrapper_log_level: int = logging.INFO,
             reward_predictor_log_level: int = logging.INFO,
             pref_interface_log_level: int = logging.INFO):
    """
    A wrapper that collects segments from the observations returned through its internal env's .step()
    function, and sends them to a PrefInterface that queries either humans or a synthetic reward oracle
    for preferences. It also manages creating and training a reward prediction network, using preferences
    stored in a PrefDB as training examples. When a minimum number of training steps has been reached, it
    loads the trained reward predictor network and starts using that as the returned reward, rather than
    the underlying environment reward.

    :param env: Underlying environment
    :param reward_predictor_network: Callable mapping between input obs and reward scalar
    :param train_reward: A boolean specifying whether or not the env should train a reward predictor
    :param collect_prefs: A boolean specifying whether or not the env should collect preferences in a PrefDB
    :param segment_length: How many observations long a segment should be before it's sent to the PrefInterface
    :param mp_context: A string specifying the multiprocessing context we want to use for this env's processes
    :param prefs_dir: A string path specifying where an existing set of PrefDBs is stored, if any exist
    :param log_dir: A string path specifying where logs and artifacts from this run should be saved
    :param max_prefs_in_db: The maximum number of preferences to store across both train and validation PrefDBs
    :param obs_transform_func: An optional transformation function to transform the observation returned by
                               our internal environment into the observation that should be concatenated to
                               form our segments (for example, if the underlying environment is a Dict space,
                               your transform func could return obs['pov'])
    :param n_initial_training_steps: How many training steps should be performed before we switch to using a
                                     trained reward model as our returned environment reward
    :param n_initial_prefs: How many preferences to collect before starting to train our reward predictor
    :param pretrained_reward_predictor_dir: A string path specifying where a pretrained reward predictor is
                                            saved, if one exists
    :param reward_predictor_refresh_interval: Interval of reward predictor training steps on which to update
                                              the reward predictor used by the env to calculate reward
    :param validation_interval: Interval of reward predictor training steps on which to perform validation
    :param reward_database_refresh_interval: Interval of reward predictor training steps on which to refresh
                                             the PrefDBs used for training/validation
    :param reward_predictor_ckpt_interval: The interval of reward training steps on which we should
                                           automatically checkpoint the reward prediction model
    :param synthetic_prefs: If True, we use the reward function of the environment to calculate prefs; if
                            False, we query for human preferences using a GUI interface
    :param max_pref_interface_segs: The maximum number of segments that will be stored and paired with one
                                    another by the preference interface
    :param zoom_ratio: How much images should be zoomed when they're displayed to humans in the GUI (ignored
                       if using synthetic preferences)
    :param channels: The number of channels the images you'll show to humans will have. (Can't be inferred
                     from the observation space shape because common usage involves a FrameStack wrapper,
                     which will stack frames along the channel dimension)
    :param env_wrapper_log_level: The log level of the logger corresponding to the wrapper as a whole
    :param reward_predictor_log_level: The log level of the logger corresponding to the reward predictor
                                       training function
    :param pref_interface_log_level: The log level of the logger used by the preference interface
    """
    # Recommend using 'spawn' for non-synthetic preferences and 'fork' for synthetic
    super(HumanPreferencesEnvWrapper, self).__init__(env)
    self.logger = logging.getLogger("HumanPreferencesEnvWrapper")
    self.logger.setLevel(env_wrapper_log_level)
    self.reward_predictor_log_level = reward_predictor_log_level
    self.pref_interface_log_level = pref_interface_log_level
    self.obs_shape = env.observation_space.shape
    self.preference_interface = PrefInterface(synthetic_prefs=synthetic_prefs,
                                              max_segs=max_pref_interface_segs,
                                              log_dir=log_dir,
                                              channels=channels,
                                              zoom=zoom_ratio)

    # Save a bunch of init parameters as wrapper properties
    self.synthetic_prefs = synthetic_prefs
    self.mp_context = mp_context
    self.train_reward = train_reward
    self.collect_prefs = collect_prefs
    self.segment_length = segment_length
    self.reward_predictor_network = reward_predictor_network
    self.pretrained_reward_predictor_dir = pretrained_reward_predictor_dir
    self.obs_transform_func = obs_transform_func
    self.prefs_dir = prefs_dir
    self.max_prefs = max_prefs_in_db
    self.n_initial_prefs = n_initial_prefs
    self.n_initial_training_steps = n_initial_training_steps
    self.log_dir = log_dir
    self.ckpt_interval = reward_predictor_ckpt_interval
    self.reward_predictor_refresh_interval = reward_predictor_refresh_interval
    self.val_interval = validation_interval
    self.reward_database_refresh_interval = reward_database_refresh_interval

    # Setting counter and status variables to initial values
    self.segments_collected = 0
    self.reward_predictor_n_train = 0
    self.using_reward_from_predictor = False
    self.force_return_true_reward = False
    self.collecting_segments = True
    self.last_true_reward = None

    # Create empty observation stack and new segment
    self.recent_obs_stack = []
    self.episode_segment = Segment()
    self.reward_predictor_checkpoint_dir = os.path.join(log_dir, 'reward_predictor_checkpoints')

    # Create Queues and Values to handle multiprocessing communication
    # TODO figure out how to make the mechanics of this work with larger Queues, so we don't drop
    # TODO segments on the ground due to timing issues
    self.seg_pipe = mp.get_context(self.mp_context).Queue(maxsize=5)
    self.pref_pipe = mp.get_context(self.mp_context).Queue(maxsize=1)
    self.pref_db_size = mp.get_context(self.mp_context).Value('i', 0)
    self.kill_pref_interface_flag = mp.get_context(self.mp_context).Value('i', 0)
    self.kill_reward_training_flag = mp.get_context(self.mp_context).Value('i', 0)
    self.save_model_flag = mp.get_context(self.mp_context).Value('i', 0)
    self.save_prefs_flag = mp.get_context(self.mp_context).Value('i', 0)
    self.reward_training_steps = mp.get_context(self.mp_context).Value('i', 0)

    # Create placeholder parameters for things that we'll initialize later
    self.pref_interface_proc = None
    self.reward_training_proc = None
    self.pref_buffer = None
    self.reward_predictor = None

    # If we want to collect preferences, we need to start a PrefInterface-running process
    if self.collect_prefs:
        self._start_pref_interface()
    # If we want to save preferences and/or train a reward model, we need to start a reward predictor
    # training process (which also handles creating a PrefDB in which preferences are stored/saved)
    if self.train_reward or self.collect_prefs:
        self._start_reward_predictor_training()
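# Illustrative usage sketch (hypothetical helper, not part of the original class): wrapping an
# image-observation environment and stepping it. The environment id, the keyword values, and the
# classic gym step API returning (obs, reward, done, info) are all assumptions for this example;
# see __init__'s docstring for what each parameter controls.
def _example_wrap_env():
    import gym  # assumes a gym-style env with image observations
    env = HumanPreferencesEnvWrapper(gym.make('PongNoFrameskip-v4'),
                                     synthetic_prefs=True,
                                     segment_length=40,
                                     n_initial_prefs=40,
                                     log_dir='/tmp/drlhp_example_logs/')
    obs = env.reset()
    for _ in range(100):
        # Reward comes from the underlying env until enough reward predictor training has happened
        obs, reward, done, info = env.step(env.action_space.sample())
        if done:
            obs = env.reset()
    env.close()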
def setUp(self):
    self.p = PrefInterface(synthetic_prefs=True, max_segs=1000, log_dir='/tmp')
    termcolor.cprint(self._testMethodName, 'red')