def _make_trainer(self, train_env, eval_env, output_dir, model=None, **kwargs):
  # Default to a trivial one-layer policy-and-value model so tests stay fast.
  if model is None:
    model = lambda: layers.Serial(layers.Dense(1))
  return ppo_trainer.PPO(
      train_env=train_env,
      eval_env=eval_env,
      policy_and_value_model=model,
      n_optimizer_steps=1,
      output_dir=output_dir,
      random_seed=0,
      max_timestep=3,
      boundary=2,
      save_every_n=1,
      **kwargs)

def _run_training_loop(self, train_env, eval_env, output_dir, model=None):
  n_epochs = 2
  # Build the trainer through the shared helper instead of duplicating the
  # PPO constructor call, then run the training loop.
  trainer = self._make_trainer(
      train_env=train_env,
      eval_env=eval_env,
      output_dir=output_dir,
      model=model,
  )
  trainer.training_loop(n_epochs=n_epochs)
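# Usage sketch (an assumption, not part of the original excerpt): a minimal
# test method exercising the helpers above. `self._make_env()` is a
# hypothetical stand-in for however the suite constructs its train/eval
# environments, and `self.get_temp_dir()` assumes the test case inherits
# from tf.test.TestCase; a real test would substitute its own env setup.
def test_training_loop_runs(self):
  train_env = self._make_env()   # hypothetical env-construction helper
  eval_env = self._make_env()    # hypothetical env-construction helper
  output_dir = self.get_temp_dir()
  # Should train for two epochs and write checkpoints (save_every_n=1)
  # without raising.
  self._run_training_loop(train_env, eval_env, output_dir)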