Example #1
def _run_pref_interface(pref_interface: PrefInterface,
                        seg_pipe: mp.Queue,
                        pref_pipe: mp.Queue,
                        kill_processes: mp.Value,
                        log_level: int = logging.INFO):
    """
    A thin wrapper that calls pref_interface.run(); meant to be used as the target of a
    multiprocessing Process.

    :param pref_interface: The PrefInterface object you want to run
    :param seg_pipe: A multiprocessing Queue into which the env adds new segments for the PrefInterface to pair and
                     request preferences for
    :param pref_pipe: A multiprocessing Queue into which the PrefInterface adds preferences once collected, making
                      them accessible to the PrefDB in which they are stored and used for reward predictor training
    :param kill_processes: A multiprocessing Value that will be set to 1 if we want to terminate running processes
                           (specifically, it will trigger pref_interface.run() to return so we can easily join
                           the process)
    :param log_level: The log level to use for the PrefInterface's logger
    """
    pref_interface.run(seg_pipe=seg_pipe,
                       pref_pipe=pref_pipe,
                       kill_processes=kill_processes,
                       log_level=log_level)
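A minimal launch sketch for this helper, assuming PrefInterface and _run_pref_interface are importable from the surrounding module; the queue sizes and constructor arguments below are illustrative only:

import logging
import multiprocessing as mp

ctx = mp.get_context('spawn')
seg_pipe = ctx.Queue(maxsize=5)    # env -> PrefInterface: segments to pair up
pref_pipe = ctx.Queue(maxsize=1)   # PrefInterface -> PrefDB: collected preferences
kill_flag = ctx.Value('i', 0)      # set to 1 to ask pref_interface.run() to return

# Illustrative PrefInterface arguments
pi = PrefInterface(synthetic_prefs=True, max_segs=25, log_dir='/tmp/pref_logs')
proc = ctx.Process(target=_run_pref_interface,
                   args=(pi, seg_pipe, pref_pipe, kill_flag, logging.INFO),
                   daemon=True)
proc.start()
# ... feed segments into seg_pipe, read preferences from pref_pipe ...
kill_flag.value = 1   # signal shutdown so the process can be joined cleanly
proc.join()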
Example #2
    def test_recv_segments(self):
        """
        Check that segments are stored correctly in the circular buffer.
        """
        pi = PrefInterface(synthetic_prefs=True, max_segs=5, log_dir='/tmp')
        pipe = Queue()
        for i in range(5):
            pipe.put(i)
            pi.recv_segments(pipe)
        np.testing.assert_array_equal(pi.segments, [0, 1, 2, 3, 4])
        for i in range(5, 8):
            pipe.put(i)
            pi.recv_segments(pipe)
        np.testing.assert_array_equal(pi.segments, [5, 6, 7, 3, 4])
        for i in range(8, 11):
            pipe.put(i)
            pi.recv_segments(pipe)
        np.testing.assert_array_equal(pi.segments, [10, 6, 7, 8, 9])
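The circular-buffer behaviour this test exercises can be sketched as a small stand-alone helper (a simplified, hypothetical stand-in for the storage step inside recv_segments, not the actual implementation): append until the buffer holds max_segs entries, then overwrite the oldest slots in order.

def store_segment(segments, seg_idx, segment, max_segs):
    """Hypothetical helper: store segment, returning the updated overwrite index."""
    if len(segments) < max_segs:
        # Buffer not yet full: just append
        segments.append(segment)
        return seg_idx
    # Buffer full: overwrite in order, wrapping around
    segments[seg_idx] = segment
    return (seg_idx + 1) % max_segs

# Mirrors the first two assertions of the test above
segments, idx = [], 0
for i in range(8):
    idx = store_segment(segments, idx, i, max_segs=5)
assert segments == [5, 6, 7, 3, 4]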
Example #3
def start_pref_interface(seg_pipe, pref_pipe, max_segs, synthetic_prefs,
                         log_dir, zoom, channels):
    def f():
        # The preference interface needs to get input from stdin. stdin is
        # automatically closed at the beginning of child processes in Python,
        # so this is a bit of a hack, but it seems to be fine.
        sys.stdin = os.fdopen(0)
        pi.run(seg_pipe=seg_pipe, pref_pipe=pref_pipe)

    # Needs to be done in the main process because it does GUI setup work
    prefs_log_dir = osp.join(log_dir, 'pref_interface')
    pi = PrefInterface(synthetic_prefs=synthetic_prefs,
                       max_segs=max_segs,
                       log_dir=prefs_log_dir,
                       channels=channels,
                       zoom=zoom)
    print("Preference interface has been created")
    proc = Process(target=f, daemon=True)
    proc.start()
    return pi, proc
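A hedged usage sketch for this helper; the queue sizes and argument values below are illustrative, and start_pref_interface is assumed to be importable from the surrounding module:

from multiprocessing import Queue

seg_pipe = Queue(maxsize=5)
pref_pipe = Queue(maxsize=1)
pi, pref_proc = start_pref_interface(seg_pipe=seg_pipe,
                                     pref_pipe=pref_pipe,
                                     max_segs=25,
                                     synthetic_prefs=False,
                                     log_dir='runs/exp1',
                                     zoom=4,
                                     channels=3)
# Segments pushed onto seg_pipe get paired and shown for preference collection;
# the resulting preferences arrive on pref_pipe. pref_proc is a daemon process,
# so it exits together with the parent process.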
Example #4
    def __init__(self,
                 env: Env,
                 reward_predictor_network: Callable = net_cnn,
                 train_reward: bool = True,
                 collect_prefs: bool = True,
                 segment_length: int = 40,
                 mp_context: str = 'spawn',
                 prefs_dir: str = None,
                 log_dir: str = "drlhp_logs/",
                 max_prefs_in_db: int = 10000,
                 obs_transform_func: Callable = None,
                 n_initial_training_steps: int = 50,
                 n_initial_prefs: int = 40,
                 pretrained_reward_predictor_dir: str = None,
                 reward_predictor_ckpt_interval: int = 10,
                 reward_predictor_refresh_interval: int = 10,
                 validation_interval: int = 10,
                 reward_database_refresh_interval: int = 1,
                 synthetic_prefs: bool = True,
                 max_pref_interface_segs: int = 25,
                 zoom_ratio: int = 4,
                 channels: int = 3,
                 env_wrapper_log_level: int = logging.INFO,
                 reward_predictor_log_level: int = logging.INFO,
                 pref_interface_log_level: int = logging.INFO
                 ):
        """
        A Wrapper that collects segments from the observations returned through its internal env's .step() function,
        and sends them to a PrefInterface that queries either humans or a synthetic reward oracle for preferences.

        It also manages creating and training a reward prediction network, using preferences stored in a PrefDB as
        training examples. When a minimum number of training steps has been reached, it loads the trained reward
        predictor network and starts using that as the returned reward, rather than the underlying environment's reward.

        :param env: Underlying environment
        :param reward_predictor_network: Callable mapping between input obs and reward scalar
        :param train_reward: A boolean specifying whether or not the env should train a reward predictor
        :param collect_prefs: A boolean specifying whether or not the env should collect preferences in a PrefDB
        :param segment_length: How many observations long a segment should be before it's sent to the PrefInterface
        :param mp_context: A string specifying the multiprocessing context we want to use for this env's processes
        :param prefs_dir: A string path specifying where an existing set of PrefDBs is stored, if any exist
        :param log_dir: A string path specifying where logs and artifacts from this run should be saved
        :param max_prefs_in_db: The maximum number of preferences to store across both train and validation PrefDBs
        :param obs_transform_func: An optional function to transform the observation returned by our internal
                                    environment into the observation that should be concatenated to form our
                                    segments (for example, if the underlying environment has a Dict observation
                                    space, your transform func could return obs['pov'])
        :param n_initial_training_steps: How many training steps should be performed before we switch to using a
                                        trained reward model as our returned environment reward
        :param n_initial_prefs: How many preferences to collect before starting to train our reward predictor
        :param pretrained_reward_predictor_dir: A string path specifying where a pretrained reward predictor
                                                is saved, if one exists
        :param reward_predictor_ckpt_interval: The interval of reward training steps on which we should automatically
                                               checkpoint the reward prediction model
        :param reward_predictor_refresh_interval: Interval of reward predictor training steps on which to update the
                                                  reward predictor used by the env to calculate reward
        :param validation_interval: Interval of reward predictor training steps on which to perform validation
        :param reward_database_refresh_interval: Interval of reward predictor training steps on which to refresh the
                                                 PrefDBs used for training/validation
        :param synthetic_prefs: If True, we use the reward function of the environment to calculate prefs; if False,
                                we query for human preferences using a GUI interface
        :param max_pref_interface_segs: The maximum number of segments that will be stored and paired with one another by
                                        the preference interface
        :param zoom_ratio: How much images should be zoomed when they're displayed to humans in the GUI (ignored if using
                            synthetic preferences)
        :param channels: The number of channels the images you'll show to humans will have. (Can't be inferred from
                         observation space shape because common usage involves a FrameStack wrapper, which will stack
                         frames along the channel dimension)
        :param env_wrapper_log_level: The log level of the logger corresponding to the wrapper as a whole
        :param reward_predictor_log_level: The log level of the logger corresponding to the reward predictor training function
        :param pref_interface_log_level: The log level of the logger used by the preference interface
        """

        # Recommended: use 'spawn' for non-synthetic preferences and 'fork' for synthetic preferences
        super(HumanPreferencesEnvWrapper, self).__init__(env)
        self.logger = logging.getLogger("HumanPreferencesEnvWrapper")
        self.logger.setLevel(env_wrapper_log_level)
        self.reward_predictor_log_level = reward_predictor_log_level
        self.pref_interface_log_level = pref_interface_log_level

        self.obs_shape = env.observation_space.shape

        self.preference_interface = PrefInterface(synthetic_prefs=synthetic_prefs,
                                                  max_segs=max_pref_interface_segs,
                                                  log_dir=log_dir,
                                                  channels=channels,
                                                  zoom=zoom_ratio)

        # Save a bunch of init parameters as wrapper properties
        self.synthetic_prefs = synthetic_prefs
        self.mp_context = mp_context
        self.train_reward = train_reward
        self.collect_prefs = collect_prefs
        self.segment_length = segment_length
        self.reward_predictor_network = reward_predictor_network
        self.pretrained_reward_predictor_dir = pretrained_reward_predictor_dir
        self.obs_transform_func = obs_transform_func
        self.prefs_dir = prefs_dir
        self.max_prefs = max_prefs_in_db
        self.n_initial_prefs = n_initial_prefs
        self.n_initial_training_steps = n_initial_training_steps
        self.log_dir = log_dir
        self.ckpt_interval = reward_predictor_ckpt_interval
        self.reward_predictor_refresh_interval = reward_predictor_refresh_interval
        self.val_interval = validation_interval
        self.reward_database_refresh_interval = reward_database_refresh_interval


        # Setting counter and status variables to initial values
        self.segments_collected = 0
        self.reward_predictor_n_train = 0
        self.using_reward_from_predictor = False
        self.force_return_true_reward = False
        self.collecting_segments = True
        self.last_true_reward = None

        # Create empty observation stack and new segment
        self.recent_obs_stack = []
        self.episode_segment = Segment()
        self.reward_predictor_checkpoint_dir = os.path.join(log_dir, 'reward_predictor_checkpoints')

        # Create Queues and Values to handle multiprocessing communication
        # TODO: figure out how to make the mechanics of this work with larger Queues, so we don't drop
        #       segments on the ground due to timing issues
        self.seg_pipe = mp.get_context(self.mp_context).Queue(maxsize=5)
        self.pref_pipe = mp.get_context(self.mp_context).Queue(maxsize=1)
        self.pref_db_size = mp.get_context(self.mp_context).Value('i', 0)
        self.kill_pref_interface_flag = mp.get_context(self.mp_context).Value('i', 0)
        self.kill_reward_training_flag = mp.get_context(self.mp_context).Value('i', 0)
        self.save_model_flag = mp.get_context(self.mp_context).Value('i', 0)
        self.save_prefs_flag = mp.get_context(self.mp_context).Value('i', 0)
        self.reward_training_steps = mp.get_context(self.mp_context).Value('i', 0)

        # Create placeholder parameters for things that we'll initialize later
        self.pref_interface_proc = None
        self.reward_training_proc = None
        self.pref_buffer = None
        self.reward_predictor = None

        # If we want to collect preferences, we need to start a PrefInterface-running process
        if self.collect_prefs:
            self._start_pref_interface()
        # If we want to save preferences and/or train a reward model, we need to start a reward predictor training
        # process (which also handles creating a PrefDB in which preferences are stored/saved)
        if self.train_reward or self.collect_prefs:
            self._start_reward_predictor_training()
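A minimal wrapping sketch for this class, assuming a classic Gym image environment and the old 4-tuple step API; the environment id and keyword values below are illustrative only:

import gym

env = gym.make('PongNoFrameskip-v4')   # illustrative image-observation env
wrapped = HumanPreferencesEnvWrapper(env,
                                     synthetic_prefs=True,
                                     segment_length=40,
                                     n_initial_prefs=40,
                                     n_initial_training_steps=50,
                                     log_dir='drlhp_logs/')

obs = wrapped.reset()
for _ in range(1000):
    # Segments are collected from these observations and sent for preference
    # labelling; once enough reward-predictor training has happened, the
    # returned reward comes from the learned predictor instead of the env.
    obs, reward, done, info = wrapped.step(wrapped.action_space.sample())
    if done:
        obs = wrapped.reset()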
Example #5
    def setUp(self):
        self.p = PrefInterface(synthetic_prefs=True,
                               max_segs=1000,
                               log_dir='/tmp')
        termcolor.cprint(self._testMethodName, 'red')