def f(make_model):
    """Round-trips a model through save/load and checks the copies' predictions match."""
    policy = base_policies.RandomPolicy(venv.observation_space, venv.action_space)
    with datasets.transitions_factory_from_policy(venv, policy) as dataset_callable:
        batch = dataset_callable(1024)

    with graph.as_default(), session.as_default():
        original = make_model(venv)
        session.run(tf.global_variables_initializer())

        with tempfile.TemporaryDirectory(prefix="eval-rew-serialize") as tmpdir:
            original.save(tmpdir)

            # Load the saved model twice: directly, and indirectly via the registry.
            with tf.variable_scope("loaded_direct"):
                loaded_direct = util_serialize.Serializable.load(tmpdir)

            model_name = "evaluating_rewards/RewardModel-v0"
            loaded_indirect = serialize.load_reward(model_name, tmpdir, venv)

        models = {"o": original, "ld": loaded_direct, "li": loaded_indirect}
        preds = base.evaluate_models(models, batch)

    for model in models.values():
        assert original.observation_space == model.observation_space
        assert original.action_space == model.action_space

    assert len(preds) == len(models)
    for pred in preds.values():
        assert np.allclose(preds["o"], pred)
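# A minimal sketch of driving the helper above, assuming the closed-over
# pytest fixtures (`venv`, `graph`, `session`) are in scope. The factory
# mirrors the MLPRewardModel construction used in test_regress below.
f(lambda venv: base.MLPRewardModel(venv.observation_space, venv.action_space))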
def plot_pm_reward(
    styles: Iterable[str],
    env_name: str,
    discount: float,
    models: Sequence[Tuple[str, str, str]],
    data_root: str,
    # Mesh parameters
    pos_lim: float,
    pos_density: int,
    vel_lim: float,
    act_lim: float,
    density: int,
    # Figure parameters
    ncols: int,
    cbar_kwargs: Mapping[str, Any],
    log_dir: str,
    fmt: str,
) -> xr.DataArray:
    """Entry-point into script to visualize a reward model for point mass."""
    with stylesheets.setup_styles(styles):
        env = gym.make(env_name)
        venv = vec_env.DummyVecEnv([lambda: env])
        goal = np.array([0.0])

        rewards = {}
        with networks.make_session():
            for model_name, reward_type, reward_path in models:
                reward_path = os.path.join(data_root, reward_path)
                model = serialize.load_reward(reward_type, reward_path, venv, discount)
                reward = point_mass_analysis.evaluate_reward_model(
                    env,
                    model,
                    goal=goal,
                    pos_lim=pos_lim,
                    pos_density=pos_density,
                    vel_lim=vel_lim,
                    act_lim=act_lim,
                    density=density,
                )
                rewards[model_name] = reward

        if len(rewards) == 1:
            reward = next(iter(rewards.values()))
            kwargs = {"col_wrap": ncols}
        else:
            reward = xr.Dataset(rewards).to_array("model")
            kwargs = {"row": "model"}  # facet on the dimension created by to_array

        fig = point_mass_analysis.plot_reward(reward, cbar_kwargs=cbar_kwargs, **kwargs)
        save_path = os.path.join(log_dir, "reward")
        visualize.save_fig(save_path, fig, fmt=fmt)

        return reward
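# A minimal sketch of calling the entry-point above; every value here is a
# placeholder. The (name, reward_type, reward_path) triple uses the hardcoded
# PointMass ground-truth reward, whose path is ignored ("dummy"), so
# data_root can be anything.
reward = plot_pm_reward(
    styles=[],
    env_name="evaluating_rewards/PointMassLine-v0",
    discount=0.99,
    models=[("gt", "evaluating_rewards/PointMassGroundTruth-v0", "dummy")],
    data_root=".",
    pos_lim=1.0,
    pos_density=9,
    vel_lim=1.0,
    act_lim=1.0,
    density=21,
    ncols=3,
    cbar_kwargs={},
    log_dir="/tmp/plot_pm_reward",
    fmt="pdf",
)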
def regress(
    seed: int,
    # Dataset
    env_name: str,
    discount: float,
    # Target specification
    target_reward_type: str,
    target_reward_path: str,
    # Model parameters
    make_source: MakeModelFn,
    source_init: bool,
    make_trainer: MakeTrainerFn,
    do_training: DoTrainingFn,
    # Logging
    log_dir: str,
    checkpoint_interval: int,
) -> V:
    """Train a model on target and save the results, reporting training stats."""
    # This venv is needed by serialize.load_reward, but is never stepped.
    venv = vec_env.DummyVecEnv([lambda: gym.make(env_name)])

    with networks.make_session() as (_, sess):
        tf.random.set_random_seed(seed)

        with tf.variable_scope("source") as model_scope:
            model = make_source(venv)

        with tf.variable_scope("target"):
            target = serialize.load_reward(target_reward_type, target_reward_path, venv, discount)

        with tf.variable_scope("train") as train_scope:
            trainer = make_trainer(model, model_scope, target)

        # Do not initialize the target's variables: they were already set
        # when the model was deserialized.
        init_vars = train_scope.global_variables()
        if source_init:
            init_vars += model_scope.global_variables()
        sess.run(tf.initializers.variables(init_vars))

        def callback(epoch: int) -> None:
            if checkpoint_interval > 0 and epoch % checkpoint_interval == 0:
                trainer.model.save(os.path.join(log_dir, "checkpoints", f"{epoch:05d}"))

        stats = do_training(target, trainer, callback)

        # The trainer may wrap the source model (see e.g. RegressWrappedModel),
        # so save `trainer.model` rather than `model` directly.
        trainer.model.save(os.path.join(log_dir, "checkpoints", "final"))

    with open(os.path.join(log_dir, "stats.pkl"), "wb") as f:
        pickle.dump(stats, f)

    return stats
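# A sketch of the plug-in callables `regress` expects, under the assumption
# that comparisons.RegressModel and its fit() method (exercised in
# test_regress below) satisfy the MakeTrainerFn/DoTrainingFn interfaces.
# Not authoritative: the real factories may differ.
env_name = "evaluating_rewards/PointMassLine-v0"  # placeholder

def example_make_source(venv):
    return base.MLPRewardModel(venv.observation_space, venv.action_space)

def example_make_trainer(model, model_scope, target):
    del model_scope  # unused by this simple trainer
    return comparisons.RegressModel(model, target)

def example_do_training(target, trainer, callback):
    # RegressModel already closes over its target; callback is ignored in
    # this sketch, so no intermediate checkpoints are written.
    del target, callback
    with datasets.transitions_factory_from_random_model(env_name) as dataset_generator:
        return trainer.fit(dataset_generator, total_timesteps=int(1e5), batch_size=512)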
def get_affine_from_models(env_name: str, paths: Iterable[str]):
    """Extract affine parameters from reward models."""
    venv = vec_env.DummyVecEnv([lambda: gym.make(env_name)])
    res = {}
    with networks.make_session():
        for path in paths:
            model = serialize.load_reward(
                "evaluating_rewards/RewardModel-v0",
                os.path.join(path, "model"),
                venv,
            )
            res[path] = model.models["wrapped"][0].get_weights()
    return res
def test_regress(
    graph: tf.Graph,
    session: tf.Session,
    target: str,
    loss_ub: float,
    rel_loss_lb: float,
    discount: float = 0.99,
):
    """Test regression onto target.

    Args:
        graph: The TensorFlow graph to build the models in.
        session: The TensorFlow session to run training in.
        target: The target reward model type. Must be a hardcoded reward:
            we always load with the path "dummy".
        loss_ub: The maximum loss of the model at the end of training.
        rel_loss_lb: The minimum relative improvement to the initial loss.
        discount: The discount rate used when loading the target model.
    """
    env_name = "evaluating_rewards/PointMassLine-v0"
    venv = vec_env.DummyVecEnv([lambda: gym.make(env_name)])

    with datasets.transitions_factory_from_random_model(env_name) as dataset_generator:
        with graph.as_default(), session.as_default():
            with tf.variable_scope("source") as source_scope:
                source = base.MLPRewardModel(venv.observation_space, venv.action_space)

            with tf.variable_scope("target"):
                target_model = serialize.load_reward(target, "dummy", venv, discount)

            with tf.variable_scope("match") as match_scope:
                match = comparisons.RegressModel(source, target_model)

            init_vars = source_scope.global_variables() + match_scope.global_variables()
            session.run(tf.initializers.variables(init_vars))

            stats = match.fit(dataset_generator, total_timesteps=int(1e5), batch_size=512)

    loss = pd.DataFrame(stats["loss"])["singleton"]
    logging.info(f"Loss: {loss.iloc[::10]}")
    initial_loss = loss.iloc[0]
    logging.info(f"Initial loss: {initial_loss}")
    final_loss = loss.iloc[-10:].mean()
    logging.info(f"Final loss: {final_loss}")

    assert initial_loss / final_loss > rel_loss_lb
    assert final_loss < loss_ub
def test_ground_truth_similar_to_gym(graph, session, venv, reward_id):
    """Checks that reward model predictions match those of the Gym reward."""
    # Generate rollouts, recording Gym reward
    policy = base_policies.RandomPolicy(venv.observation_space, venv.action_space)
    transitions = rollout.generate_transitions(policy, venv, n_timesteps=1024)
    gym_reward = transitions.rews

    # Make predictions using reward model
    with graph.as_default(), session.as_default():
        reward_model = serialize.load_reward(reward_id, "dummy", venv, 1.0)
        pred_reward = base.evaluate_models({"m": reward_model}, transitions)["m"]

    # Are the predictions close to the true Gym reward?
    np.testing.assert_allclose(gym_reward, pred_reward, rtol=0, atol=5e-5)
def load_models(
    env_name: str,
    discount: float,
    reward_cfgs: Iterable[common_config.RewardCfg],
) -> Mapping[common_config.RewardCfg, base.RewardModel]:
    """Load the reward models specified by `reward_cfgs`.

    Args:
        env_name: The environment name in the Gym registry of the rewards to compare.
        discount: Discount to use for reward models (mostly for shaping).
        reward_cfgs: Iterable of reward configurations.

    Returns:
        A mapping from reward configurations to the loaded reward models.
    """
    venv = vec_env.DummyVecEnv([lambda: gym.make(env_name)])
    return {
        (kind, path): serialize.load_reward(kind, path, venv, discount)
        for kind, path in reward_cfgs
    }
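# Hypothetical usage of load_models, inside a session created by
# networks.make_session() as in the surrounding snippets; hardcoded rewards
# ignore their path, so "dummy" suffices.
with networks.make_session():
    models = load_models(
        "evaluating_rewards/PointMassLine-v0",
        discount=0.99,
        reward_cfgs=[
            ("evaluating_rewards/PointMassSparseWithCtrl-v0", "dummy"),
            ("evaluating_rewards/PointMassGroundTruth-v0", "dummy"),
        ],
    )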
def make_source(venv):
    kind, path = source_reward_cfg
    return serialize.load_reward(kind, path, venv, discount)
def load_monte_carlo_greedy(path: str, env: vec_env.VecEnv) -> MonteCarloGreedyPolicy:
    """Load a MonteCarloGreedyPolicy from a "reward_type:reward_path:discount" spec."""
    reward_type, reward_path, discount = path.split(":")
    reward_model = serialize.load_reward(reward_type, reward_path, env, float(discount))
    return MonteCarloGreedyPolicy(env, reward_model=reward_model)
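# Hypothetical usage: the colon-separated spec packs reward type, path and
# discount into a single string. This assumes an active TF session (e.g. via
# networks.make_session(), as elsewhere in these snippets) and a hardcoded
# reward whose "dummy" path is ignored.
with networks.make_session():
    venv = vec_env.DummyVecEnv([lambda: gym.make("evaluating_rewards/PointMassLine-v0")])
    policy = load_monte_carlo_greedy(
        "evaluating_rewards/PointMassGroundTruth-v0:dummy:0.99", venv
    )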
def make_source(venv):
    return serialize.load_reward(source_reward_type, source_reward_path, venv, discount)
def test_sample_canon_shaping(
    graph: tf.Graph,
    session: tf.Session,
    discount: float,
    eps: float = 1e-4,
):
    """Tests canonical_sample.sample_canon_shaping.

    Specifically, verifies that sparse, affine-transformed sparse and dense
    rewards in PointMass compare equal (distance < eps); and that sparse and
    the ground-truth (norm) reward are unequal (distance > 0.1).
    """
    venv = vec_env.DummyVecEnv([lambda: gym.make("evaluating_rewards/PointMassLine-v0")])
    reward_types = [
        "evaluating_rewards/PointMassSparseWithCtrl-v0",
        "evaluating_rewards/PointMassDenseWithCtrl-v0",
        "evaluating_rewards/PointMassGroundTruth-v0",
    ]

    with graph.as_default(), session.as_default():
        models = {k: serialize.load_reward(k, "dummy", venv, discount) for k in reward_types}

        # Affine transformation of the sparse reward: 10 * sparse + 42.
        constant = rewards.ConstantReward(venv.observation_space, venv.action_space)
        constant.constant.set_constant(42.0)
        models["big_sparse"] = rewards.LinearCombinationModelWrapper(
            {
                "model": (
                    models["evaluating_rewards/PointMassSparseWithCtrl-v0"],
                    tf.constant(10.0),
                ),
                "shift": (constant, tf.constant(1.0)),
            }
        )

        with datasets.sample_dist_from_space(venv.observation_space) as obs_dist:
            with datasets.sample_dist_from_space(venv.action_space) as act_dist:
                with datasets.transitions_factory_iid_from_sample_dist(
                    obs_dist, act_dist
                ) as iid_generator:
                    batch = iid_generator(256)

                    canon_rew = epic_sample.sample_canon_shaping(
                        models,
                        batch,
                        act_dist,
                        obs_dist,
                        n_mean_samples=256,
                        discount=discount,
                    )

    sparse_vs_affine = tabular.direct_distance(
        canon_rew["evaluating_rewards/PointMassSparseWithCtrl-v0"],
        canon_rew["big_sparse"],
        p=1,
    )
    assert sparse_vs_affine < eps

    sparse_vs_dense = tabular.direct_distance(
        canon_rew["evaluating_rewards/PointMassSparseWithCtrl-v0"],
        canon_rew["evaluating_rewards/PointMassDenseWithCtrl-v0"],
        p=1,
    )
    assert sparse_vs_dense < eps

    sparse_vs_gt = tabular.direct_distance(
        canon_rew["evaluating_rewards/PointMassSparseWithCtrl-v0"],
        canon_rew["evaluating_rewards/PointMassGroundTruth-v0"],
        p=1,
    )
    assert sparse_vs_gt > 0.1