def train(env, log_dir):
    # Save the best model whenever the training reward improves.
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)

    # Normalize observations and rewards; clip both to keep PPO updates stable.
    env = VecNormalize(env, training=True, norm_obs=True, norm_reward=True,
                       gamma=0.9997, clip_obs=10., clip_reward=10., epsilon=0.1)

    drive = PPO("MlpPolicy", env,
                ent_coef=0.01,
                vf_coef=1,
                batch_size=32,
                learning_rate=linear_schedule(0.001),
                clip_range=linear_schedule(0.1),
                n_steps=1000,
                n_epochs=20,
                tensorboard_log=log_dir + "/drive_tensorboard_log",
                verbose=1)

    # First run, then continue training without resetting the timestep counter,
    # so the TensorBoard curves stay continuous across runs.
    drive.learn(total_timesteps=total_timesteps, callback=callback)
    for i in range(total_train_runs):
        drive.learn(total_timesteps=total_timesteps, callback=callback,
                    reset_num_timesteps=False)

    drive.save("conduziadrive")
    env.close()
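# A minimal usage sketch (not part of the original script): the function above assumes
# `total_timesteps`, `total_train_runs`, and a `linear_schedule` helper exist at module
# level. One common way to define such a schedule (linear decay from the initial value
# to 0 over training) and to invoke train() might look like this:

def linear_schedule(initial_value):
    """Map remaining progress (1.0 -> 0.0) to a linearly decayed value."""
    def schedule(progress_remaining):
        return progress_remaining * initial_value
    return schedule

total_timesteps = 100_000   # assumed length of each learn() call
total_train_runs = 5        # assumed number of continuation runs

# `env` is expected to already be a vectorized Gym environment, e.g.:
# from stable_baselines3.common.vec_env import DummyVecEnv
# env = DummyVecEnv([lambda: gym.make("CarRacing-v0")])  # environment id is illustrative
# train(env, log_dir="./logs")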
if args.clip_vloss:
    # PPO-style value clipping: keep the new value estimate within clip_coef of the
    # old one, and take the pessimistic (larger) of the two squared errors.
    v_loss_unclipped = (new_values - b_returns[minibatch_ind]) ** 2
    v_clipped = b_values[minibatch_ind] + torch.clamp(
        new_values - b_values[minibatch_ind], -args.clip_coef, args.clip_coef)
    v_loss_clipped = (v_clipped - b_returns[minibatch_ind]) ** 2
    v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
    v_loss = 0.5 * v_loss_max.mean()
else:
    v_loss = 0.5 * ((new_values - b_returns[minibatch_ind]) ** 2).mean()

loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.vf_coef

optimizer.zero_grad()
loss.backward()
nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
optimizer.step()

# TRY NOT TO MODIFY: record rewards for plotting purposes
writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]['lr'], global_step)
writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step)
writer.add_scalar("losses/entropy", entropy.mean().item(), global_step)
writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
print("SPS:", int(global_step / (time.time() - start_time)))
writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)

# After the training loop finishes, release the environments and the writer.
envs.close()
writer.close()
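# Self-contained illustration (an assumption for clarity, not part of the training loop
# above) of the same clipped value-loss computation on dummy tensors, where `old_values`
# stands in for b_values[minibatch_ind] and `returns` for b_returns[minibatch_ind]:

import torch

clip_coef = 0.2
new_values = torch.tensor([0.8, 1.5, -0.3])
old_values = torch.tensor([1.0, 1.0, 0.0])
returns = torch.tensor([1.2, 0.9, 0.1])

v_loss_unclipped = (new_values - returns) ** 2
v_clipped = old_values + torch.clamp(new_values - old_values, -clip_coef, clip_coef)
v_loss_clipped = (v_clipped - returns) ** 2
v_loss = 0.5 * torch.max(v_loss_unclipped, v_loss_clipped).mean()
print(f"clipped value loss: {v_loss.item():.4f}")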
class MultiModuleExp:
    """
    A whole experiment.

    It should contain: (1) environments, (2) policies, (3) training, (4) testing.
    The results should be comparable with those of other experiments.

    The Multi-RNN experiment.
    """

    def __init__(
        self,
        args,
        env_id="HopperBulletEnv-v0",
        features_extractor_class=MultiExtractor,
        features_extractor_kwargs={},
    ) -> None:
        """Init with parameters to control the training process."""
        print("Starting MultiModuleExp")
        self.args = args
        self.env_id = env_id
        self.use_cuda = torch.cuda.is_available() and args.cuda
        self.device = torch.device("cuda" if self.use_cuda else "cpu")

        # Make Environments
        print("Making train environments...")
        venv = DummyVecEnv([
            make_env(env_id=env_id, rank=i, seed=args.seed, render=args.render)
            for i in range(args.num_envs)
        ])
        self.eval_env = DummyVecEnv(
            [make_env(env_id=env_id, rank=99, seed=args.seed, render=False)])
        if args.vec_normalize:
            venv = VecNormalize(venv)
            self.eval_env = VecNormalize(self.eval_env, norm_reward=False)

        features_extractor_kwargs["num_envs"] = args.num_envs
        policy_kwargs = {
            "features_extractor_class": features_extractor_class,
            "features_extractor_kwargs": features_extractor_kwargs,
            # Note: net_arch must be specified, because SB3 won't set the default
            # network architecture if we change the features_extractor.
            # pi: Actor (policy-function); vf: Critic (value-function)
            "net_arch": [dict(pi=[64, 64], vf=[64, 64])],
        }

        self.model = CustomizedPPO(
            CustomizedPolicy,
            venv,
            n_steps=args.rollout_n_steps,
            tensorboard_log="tb",
            policy_kwargs=policy_kwargs,
            device=self.device,
            verbose=1,
            rnn_move_window_step=args.rnn_move_window_step,
            rnn_sequence_length=args.rnn_sequence_length,
            use_sde=args.sde,
            n_epochs=args.n_epochs)

    def train(self) -> None:
        """Start training."""
        print(f"train using {self.model.device.type}")
        callback = [
            DebugCallback("Customized"),
            AdjustCameraCallback(),
            WandbCallback(self.args),
            CustomizedEvalCallback(
                self.eval_env,
                best_model_save_path=None,
                log_path=None,
                eval_freq=self.args.eval_freq,
                n_eval_episodes=3,
                verbose=0,
            ),
        ]
        self.model.learn(self.args.total_timesteps, callback=callback)

    def test(self, model_filename, vnorm_filename):
        # load() is a classmethod that returns a new model, so keep the result.
        self.model = self.model.load(model_filename)
        self.eval_env = VecNormalize.load(vnorm_filename, self.eval_env)
        self.eval_env.render()
        obs = self.eval_env.reset()
        with self.model.policy.features_extractor.start_testing():
            for i in range(1000):
                # predict() returns (action, hidden_state); act on the latest observation.
                action, _ = self.model.predict(obs, deterministic=True)
                obs, _, _, _ = self.eval_env.step(action)
        self.eval_env.close()
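# Hedged usage sketch (an assumption, not from the original code): `args` is an
# argparse-style namespace carrying the fields read above; the values and checkpoint
# file names below are placeholders, not the real experiment settings.

from argparse import Namespace

args = Namespace(
    seed=0, num_envs=4, cuda=True, render=False, vec_normalize=True,
    rollout_n_steps=2048, rnn_move_window_step=1, rnn_sequence_length=8,
    sde=False, n_epochs=10, eval_freq=10_000, total_timesteps=1_000_000)

exp = MultiModuleExp(args, env_id="HopperBulletEnv-v0")
exp.train()
# exp.test("best_model.zip", "vecnormalize.pkl")  # hypothetical checkpoint file names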