# NOTE: the import paths below are assumed from the typical `evaluating_rewards`
# project layout; adjust them to match the actual package structure.
import functools
from typing import Any, Dict, Mapping, Optional, Sequence, Tuple, Type

from evaluating_rewards import comparisons, datasets, preferences, serialize
from evaluating_rewards.distances import common_config
from evaluating_rewards.rewards import base
from evaluating_rewards.scripts import regress_utils, script_utils
from imitation.policies import serialize as policies_serialize
from imitation.util import util


def model_comparison(
    _seed: int,  # pylint:disable=invalid-name
    # Dataset
    env_name: str,
    discount: float,
    dataset_factory: datasets.TransitionsFactory,
    dataset_factory_kwargs: Dict[str, Any],
    # Source specification
    source_reward_type: str,
    source_reward_path: str,
    # Target specification
    target_reward_type: str,
    target_reward_path: str,
    # Model parameters
    comparison_class: Type[comparisons.RegressModel],
    comparison_kwargs: Dict[str, Any],
    affine_size: int,
    total_timesteps: int,
    batch_size: int,
    fit_kwargs: Dict[str, Any],
    # Logging
    log_dir: str,
) -> Mapping[str, Any]:
    """Entry-point into script to regress a serialized source model onto a target reward model."""
    with dataset_factory(env_name, seed=_seed, **dataset_factory_kwargs) as dataset_generator:

        def make_source(venv):
            return serialize.load_reward(source_reward_type, source_reward_path, venv, discount)

        def make_trainer(model, model_scope, target):
            del model_scope
            return comparison_class(model, target, **comparison_kwargs)

        def do_training(target, trainer):
            del target
            return trainer.fit(
                dataset_generator,
                total_timesteps=total_timesteps,
                batch_size=batch_size,
                affine_size=affine_size,
                **fit_kwargs,
            )

        return regress_utils.regress(
            seed=_seed,
            env_name=env_name,
            discount=discount,
            make_source=make_source,
            source_init=False,
            make_trainer=make_trainer,
            do_training=do_training,
            target_reward_type=target_reward_type,
            target_reward_path=target_reward_path,
            log_dir=log_dir,
        )
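

# A minimal invocation sketch for `model_comparison`. Everything below is a
# hypothetical placeholder: the env ID, reward registry types, paths, dataset
# factory, and `RegressWrappedModel` are illustrative assumptions, not names
# confirmed by this module.
def _example_model_comparison():
    return model_comparison(
        _seed=0,
        env_name="evaluating_rewards/PointMassLine-v0",  # assumed env ID
        discount=0.99,
        dataset_factory=datasets.transitions_factory_from_random_model,  # assumed factory
        dataset_factory_kwargs={},
        source_reward_type="evaluating_rewards/PointMassSparse-v0",  # assumed registry ID
        source_reward_path="dummy",
        target_reward_type="evaluating_rewards/PointMassDense-v0",  # assumed registry ID
        target_reward_path="dummy",
        comparison_class=comparisons.RegressWrappedModel,  # assumed RegressModel subclass
        comparison_kwargs={"learning_rate": 1e-2},
        affine_size=4096,
        total_timesteps=1_000_000,
        batch_size=4096,
        fit_kwargs={},
        log_dir="/tmp/model_comparison",
    )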


def train_regress(
    _seed: int,  # pylint:disable=invalid-name
    # Dataset
    env_name: str,
    discount: float,
    dataset_factory: datasets.TransitionsFactory,
    dataset_factory_kwargs: Dict[str, Any],
    # Target specification
    target_reward_type: str,
    target_reward_path: str,
    # Model parameters
    model_reward_type: regress_utils.EnvRewardFactory,
    total_timesteps: int,
    batch_size: int,
    learning_rate: float,
    # Logging
    checkpoint_interval: int,
    log_dir: str,
) -> Mapping[str, Any]:
    """Entry-point into script to regress a freshly initialized model onto a target reward model."""
    with dataset_factory(env_name, seed=_seed, **dataset_factory_kwargs) as dataset_generator:
        make_source = functools.partial(regress_utils.make_model, model_reward_type)

        def make_trainer(model, model_scope, target):
            del model_scope
            return comparisons.RegressModel(model, target, learning_rate=learning_rate)

        def do_training(target, trainer, callback: Optional[base.Callback]):
            del target
            return trainer.fit(
                dataset_generator,
                total_timesteps=total_timesteps,
                batch_size=batch_size,
                callback=callback,
            )

        return regress_utils.regress(
            seed=_seed,
            env_name=env_name,
            discount=discount,
            make_source=make_source,
            source_init=True,
            make_trainer=make_trainer,
            do_training=do_training,
            target_reward_type=target_reward_type,
            target_reward_path=target_reward_path,
            log_dir=log_dir,
            checkpoint_interval=checkpoint_interval,
        )
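

# Analogous sketch for `train_regress`, which trains a fresh model
# (source_init=True) rather than loading a serialized source. Values, the
# dataset factory, and the model class are hypothetical placeholders.
def _example_train_regress():
    return train_regress(
        _seed=0,
        env_name="evaluating_rewards/PointMassLine-v0",  # assumed env ID
        discount=0.99,
        dataset_factory=datasets.transitions_factory_from_random_model,  # assumed factory
        dataset_factory_kwargs={},
        target_reward_type="evaluating_rewards/PointMassDense-v0",  # assumed registry ID
        target_reward_path="dummy",
        model_reward_type=base.MLPRewardModel,  # assumed reward model class
        total_timesteps=1_000_000,
        batch_size=4096,
        learning_rate=1e-2,
        checkpoint_interval=10,
        log_dir="/tmp/train_regress",
    )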


def train_preferences(
    _seed: int,  # pylint:disable=invalid-name
    # Dataset
    env_name: str,
    discount: float,
    num_vec: int,
    policy_type: str,
    policy_path: str,
    # Target specification
    target_reward_type: str,
    target_reward_path: str,
    # Model parameters
    model_reward_type: regress_utils.EnvRewardFactory,
    trajectory_length: int,
    total_timesteps: int,
    batch_timesteps: int,
    learning_rate: float,
    weight_l2_reg: float,
    reward_l2_reg: float,
    accuracy_threshold: float,
    # Logging
    log_dir: str,
    checkpoint_interval: int,
) -> Mapping[str, Any]:
    """Entry-point into script for synthetic preference comparisons."""
    venv = util.make_vec_env(env_name, n_envs=num_vec, seed=_seed)

    make_source = functools.partial(regress_utils.make_model, model_reward_type)

    def make_trainer(model, model_scope, target):
        del target
        model_params = model_scope.global_variables()
        batch_size = batch_timesteps // trajectory_length
        kwargs = {"learning_rate": learning_rate}
        return preferences.PreferenceComparisonTrainer(
            model,
            model_params,
            batch_size=batch_size,
            optimizer_kwargs=kwargs,
            weight_l2_reg=weight_l2_reg,
            reward_l2_reg=reward_l2_reg,
            accuracy_threshold=accuracy_threshold,
        )

    with policies_serialize.load_policy(policy_type, policy_path, venv) as policy:

        def do_training(target, trainer, callback: Optional[base.Callback]):
            # Specify in terms of total_timesteps so a longer trajectory_length
            # does not give the model more data.
            total_comparisons = total_timesteps // trajectory_length
            return trainer.fit_synthetic(
                venv,
                policy=policy,
                target=target,
                trajectory_length=trajectory_length,
                total_comparisons=total_comparisons,
                callback=callback,
            )

        return regress_utils.regress(
            seed=_seed,
            env_name=env_name,
            discount=discount,
            make_source=make_source,
            source_init=True,
            make_trainer=make_trainer,
            do_training=do_training,
            target_reward_type=target_reward_type,
            target_reward_path=target_reward_path,
            log_dir=log_dir,
            checkpoint_interval=checkpoint_interval,
        )
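

# Worked example of the data-budget arithmetic in `train_preferences`
# (illustrative helper, not part of the script): both the per-batch and total
# comparison counts divide by trajectory_length, so longer trajectories do not
# increase the amount of data the model sees.
def _preference_budget(
    total_timesteps: int, batch_timesteps: int, trajectory_length: int
) -> Tuple[int, int]:
    batch_size = batch_timesteps // trajectory_length
    total_comparisons = total_timesteps // trajectory_length
    return batch_size, total_comparisons


# e.g. _preference_budget(1_000_000, 10_000, 5) == (2_000, 200_000)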


def npec_worker(
    seed: int,
    # Dataset
    env_name: str,
    discount: float,
    visitations_factory,
    visitations_factory_kwargs: Dict[str, Any],
    # Models to compare
    source_reward_cfg: common_config.RewardCfg,
    target_reward_cfg: common_config.RewardCfg,
    # Model parameters
    comparison_class: Type[comparisons.RegressModel],
    comparison_kwargs: Dict[str, Any],
    total_timesteps: int,
    batch_size: int,
    fit_kwargs: Dict[str, Any],
    # Logging
    log_dir: str,
) -> comparisons.FitStats:
    """Performs a single NPEC comparison by fitting a model.

    Args:
        seed: seed used for the visitations factory and model initialization.
        env_name: the name of the environment to compare rewards for.
        discount: discount to use for reward models (mostly for shaping).
        visitations_factory: factory to sample transitions from during training.
        visitations_factory_kwargs: keyword arguments for the visitations factory.
        source_reward_cfg: specifies the serialized source reward.
        target_reward_cfg: specifies the serialized target reward to fit the source onto.
        comparison_class: how to fit the source onto the target.
        comparison_kwargs: keyword arguments customizing `comparison_class`.
        total_timesteps: the total number of timesteps to train for.
        batch_size: the number of timesteps in each training batch.
        fit_kwargs: extra arguments to pass to the `fit` method of `comparison_class`.
        log_dir: directory to save data to.

    Returns:
        Statistics for training, including the final loss, i.e. the estimated
        NPEC distance.
    """
    # Configure logging, since Ray children do not inherit logging configs by default.
    script_utils.configure_logging()

    with visitations_factory(seed=seed, **visitations_factory_kwargs) as dataset_generator:

        def make_source(venv):
            kind, path = source_reward_cfg
            return serialize.load_reward(kind, path, venv, discount)

        def make_trainer(model, model_scope, target):
            del model_scope
            return comparison_class(model, target, **comparison_kwargs)

        def do_training(target, trainer, callback):
            del target
            return trainer.fit(
                dataset_generator,
                total_timesteps=total_timesteps,
                batch_size=batch_size,
                callback=callback,
                **fit_kwargs,
            )

        target_reward_type, target_reward_path = target_reward_cfg
        return regress_utils.regress(
            seed=seed,
            env_name=env_name,
            discount=discount,
            make_source=make_source,
            source_init=False,
            make_trainer=make_trainer,
            do_training=do_training,
            target_reward_type=target_reward_type,
            target_reward_path=target_reward_path,
            log_dir=log_dir,
            checkpoint_interval=0,  # disable checkpoints
        )
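

# The docstring above notes that `npec_worker` runs as a Ray child; below is a
# hypothetical sketch of fanning it out over seeds. `ray.remote` and `ray.get`
# are real Ray APIs; the driver function itself is illustrative.
def _run_npec_workers(worker_kwargs: Dict[str, Any], seeds: Sequence[int]):
    import ray  # deferred so the module imports without Ray installed

    remote_worker = ray.remote(npec_worker)
    futures = [remote_worker.remote(seed=seed, **worker_kwargs) for seed in seeds]
    return ray.get(futures)  # list of FitStats, one per seed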