def main():
    log_dir = 'log'
    # Create the callback: check every 1000 steps
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
    num_cpu = 16
    model_stats_path = os.path.join(log_dir, "sac_" + env_name)
    env_stats_path = os.path.join(log_dir, 'sac_LR001.pkl')
    tb_log = 'tb_log'
    videoName = '5M_timesteps_sac'
    tb_log_name = videoName

    if StartFresh:
        # env = make_vec_env(env_name, n_envs=4)
        # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
        env.reset()
        policy_kwargs = {
            'net_arch': [128, 64, 32],
        }
        model = PPO('MlpPolicy',
                    env,
                    learning_rate=0.001,
                    n_steps=500,
                    # batch_size=0,
                    # n_epochs=1,
                    gamma=0.9,
                    policy_kwargs=policy_kwargs,
                    verbose=1,
                    tensorboard_log=tb_log,
                    device="auto")
    else:
        # Resume: restore both the model weights and the VecNormalize statistics
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize.load(env_stats_path, env)
        env.reset()
        model = PPO.load(model_stats_path, tensorboard_log=tb_log)
        model.set_env(env)

    if DoTraining:
        eval_env = make_vec_env(env_name, n_envs=1)
        eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
        eval_env.reset()
        # model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=tb_log)
        model.learn(total_timesteps=25000000,
                    tb_log_name=tb_log_name,
                    reset_num_timesteps=False)  # optionally: callback=callback or TensorboardCallback()
        # Don't forget to save the VecNormalize statistics when saving the agent
        model.save(model_stats_path)
        env.save(env_stats_path)

    if DoVideo:
        # mean_reward, std_reward = evaluate_policy(model, eval_env)
        # print(f"Mean reward = {mean_reward:.2f} +/- {std_reward:.2f}")
        record_video(env_name, model, video_length=2000, prefix='ppo_' + env_name + videoName)
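# These scripts all call a make_env helper that is not shown. A minimal sketch
# of what it likely looks like, following the standard Stable-Baselines3
# multiprocessing recipe (one seeded, Monitor-wrapped env factory per worker);
# the exact body is an assumption, only the call signature comes from the
# scripts above.
import os
import gym
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.utils import set_random_seed

def make_env(env_id, rank, seed=0, log_dir=None):
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        if log_dir is not None:
            # One monitor file per worker so episode stats don't collide
            env = Monitor(env, os.path.join(log_dir, str(rank)))
        return env
    set_random_seed(seed)
    return _init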
            model = PPO.load(load_model_for_training_path, env=env)
        else:
            model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=tb_log_folder)

        eval_env_func = make_training_env(env_id, options, rank=num_cpu)
        eval_env = DummyVecEnv([eval_env_func])
        eval_env = VecNormalize(eval_env)
        eval_callback = EvalCallback(eval_env,
                                     best_model_save_path='./best_models/',
                                     log_path='./logs_best_model/',
                                     deterministic=True,
                                     render=False,
                                     n_eval_episodes=10)

        model.learn(total_timesteps=training_timesteps,
                    tb_log_name=tb_log_name,
                    callback=eval_callback)
        model.save(save_model_path)
        env.save(save_vecnormalize_path)
    else:
        options['has_renderer'] = True
        register_gripper(UltrasoundProbeGripper)
        env_gym = GymWrapper(suite.make(env_id, **options))
        env = DummyVecEnv([lambda: env_gym])

        model = PPO.load(load_model_path)
        env = VecNormalize.load(load_vecnormalize_path, env)
        env.training = False
        env.norm_reward = False

        obs = env.reset()
        eprew = 0
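# The robosuite fragment above calls make_training_env(env_id, options, rank=...)
# to build its env factory. A plausible sketch under the same imports it already
# uses (robosuite's suite.make plus its GymWrapper); the name and signature come
# from the call site, the body is an assumption.
def make_training_env(env_id, options, rank=0):
    def _init():
        # Wrap the robosuite task so Stable-Baselines3 sees a gym-style env
        return GymWrapper(suite.make(env_id, **options))
    return _init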
def main():
    # nn = torch.nn.Sequential(torch.nn.Linear(8, 64), torch.nn.Tanh(),
    #                          torch.nn.Linear(64, 2))
    os.makedirs(_log_dir, exist_ok=True)
    DoTraining = True
    StartFresh = True
    num_cpu = 8

    if DoTraining:
        # This doesn't work, possibly because of how the environment is written:
        # num_cpu = 1
        # env = make_vec_env(env_id, n_envs=num_cpu, monitor_dir=_log_dir)  # make_vec_env contains Monitor

        # Create the callback: check every 1000 steps
        # callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=_log_dir)
        if StartFresh:
            env = SubprocVecEnv([make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)])
            env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
            env.reset()
            policy_kwargs = {
                'net_arch': [128, 128, 128],
            }
            model = PPO('MlpPolicy', env, policy_kwargs=policy_kwargs, verbose=2, tensorboard_log=tb_log)
        else:
            env = SubprocVecEnv([make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)])
            env = VecNormalize.load(_stats_path, env)
            env.reset()
            # Raw string so the backslashes in the Windows path are not treated as escapes
            model = PPO.load(r'log\monitor_simpledriving_vecNormalized_128x3_2\PPO_4243456.mdl',
                             tensorboard_log=tb_log)
            model.set_env(env)

        eval_env = gym.make(env_id)
        # print('!!!!Checking Environment!!!!')
        # print(check_env(eval_env))
        mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
        print(f'mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}')

        # Train in 100k-step chunks, evaluating and checkpointing after each one
        for _ in range(50):
            model.learn(total_timesteps=100000, tb_log_name=env_id, reset_num_timesteps=False)  # callback=callback
            mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
            print(f'mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}')
            model.save(_log_dir + 'PPO_{}'.format(model.num_timesteps) + '.mdl')
            env.save(_log_dir + 'vec_normalize_{}'.format(model.num_timesteps) + '.pkl')

    if not DoTraining:
        # eval_env = SubprocVecEnv([make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)])
        # eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl', eval_env)
        # eval_env = VecVideoRecorder(eval_env, video_folder='videos/',
        #                             record_video_trigger=lambda step: step == 0, video_length=500,
        #                             name_prefix='test')
        # eval_env.training = False
        # eval_env.norm_reward = False
        # eval_env.reset()
        eval_env = DummyVecEnv([make_env(env_id, i, log_dir=_log_dir) for i in range(1)])
        # eval_env = gym.make(env_id)
        eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl', eval_env)
        model = PPO.load(r'log\monitor_simpledriving_vecNormalized_128x3\PPO_5734400.mdl',
                         tensorboard_log=tb_log)
        model.set_env(eval_env)

        # record_video(env_id, model, video_length=500, prefix='ppo_' + env_id)
        # Start the video at step=0 and record 500 steps
        # eval_env = VecVideoRecorder(eval_env, video_folder='tmp',
        #                             record_video_trigger=lambda step: step == 0, video_length=500,
        #                             name_prefix='')
        obs = eval_env.reset()
        # for i in range(500):
        #     action, _ = model.predict(obs)
        #     obs, _, _, _ = eval_env.step(action)
        # eval_env.close()
        while True:
            action, _states = model.predict(obs, deterministic=True)
            obs, _, done, _ = eval_env.step(action)
            # eval_env.render()
            if done.any():
                # obs = eval_env.reset()
                # time.sleep(1/30)
                eval_env.close()
                break
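# The record_video helper referenced above is not shown. The commented-out
# VecVideoRecorder lines in the script suggest something like this sketch
# (the function name comes from the call sites; defaults and the folder name
# are assumptions):
import gym
from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder

def record_video(env_id, model, video_length=500, prefix='', video_folder='videos/'):
    eval_env = DummyVecEnv([lambda: gym.make(env_id)])
    # Start recording at step 0 and keep going for video_length steps
    eval_env = VecVideoRecorder(eval_env,
                                video_folder=video_folder,
                                record_video_trigger=lambda step: step == 0,
                                video_length=video_length,
                                name_prefix=prefix)
    obs = eval_env.reset()
    for _ in range(video_length):
        action, _ = model.predict(obs)
        obs, _, _, _ = eval_env.step(action)
    eval_env.close()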
def main():
    # Multiprocess environment
    n_cpu = 8
    env = SubprocVecEnv([lambda: gym.make('DYROSTocabi-v1') for _ in range(n_cpu)])
    env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False, training=True)

    # n_cpu = 1
    # env = gym.make('DYROSTocabi-v1')
    # env = DummyVecEnv([lambda: env])
    # env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False, training=True)

    # Note: wandb_use is not a stock Stable-Baselines3 argument; this assumes a custom fork
    model = PPO('MlpPolicy', env, verbose=1, n_steps=int(4096 / n_cpu), wandb_use=True)
    model.learn(total_timesteps=40000000)

    file_name = "ppo2_DYROSTocabi_" + str(datetime.datetime.now())
    model.save(file_name)
    env.save(file_name + "_env.pkl")

    # Export the policy weights and normalization statistics as plain text
    model.policy.to("cpu")
    for name, param in model.policy.state_dict().items():
        weight_file_name = "./result/" + name + ".txt"
        np.savetxt(weight_file_name, param.data)
    np.savetxt("./result/obs_mean.txt", env.obs_rms.mean)
    np.savetxt("./result/obs_variance.txt", env.obs_rms.var)

    del model  # remove to demonstrate saving and loading
    del env

    # file_name = "ppo2_DYROSTocabi_2021-02-27 02:20:20.015346"
    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize.load(file_name + "_env.pkl", env)
    env.training = False

    model = PPO.load(file_name, env=env, wandb_use=False)
    model.policy.to("cpu")
    for name, param in model.policy.state_dict().items():
        weight_file_name = "./result/" + name + ".txt"
        np.savetxt(weight_file_name, param.data)
    np.savetxt("./result/obs_mean.txt", env.obs_rms.mean)
    np.savetxt("./result/obs_variance.txt", env.obs_rms.var)

    # Enjoy trained agent
    obs = np.copy(env.reset())
    epi_reward = 0
    while True:
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = env.step(action)
        env.render()
        epi_reward += rewards
        if dones:
            print("Episode Reward: ", epi_reward)
            epi_reward = 0
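# The obs_mean.txt / obs_variance.txt files exported above only help if the
# consumer (e.g. a C++ controller on the robot) reproduces VecNormalize's
# observation scaling. A sketch of that transform; epsilon=1e-8 is
# VecNormalize's default and clip_obs=2.0 matches the training setup above,
# but the loading context is assumed.
import numpy as np

def normalize_obs(obs, mean, var, clip_obs=2.0, epsilon=1e-8):
    # Same formula VecNormalize applies: standardize, then clip
    return np.clip((obs - mean) / np.sqrt(var + epsilon), -clip_obs, clip_obs)

obs_mean = np.loadtxt("./result/obs_mean.txt")
obs_var = np.loadtxt("./result/obs_variance.txt")
# norm_obs = normalize_obs(raw_obs, obs_mean, obs_var)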
def main():
    if StartFresh:
        # Create environment
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
        env.reset()

        # Separate evaluation env
        eval_env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(1)])
        eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
        eval_env.reset()

        # Create model
        # model = SAC("MlpPolicy", env, verbose=1, tensorboard_log=tb_log, device="auto")
        policy_kwargs = dict(activation_fn=th.nn.ReLU,
                             net_arch=[dict(pi=[256, 256], vf=[256, 256])])
        model = PPO('MlpPolicy',
                    env,
                    learning_rate=3e-5,
                    n_steps=512,
                    batch_size=128,
                    n_epochs=20,
                    gamma=0.99,
                    gae_lambda=0.9,
                    clip_range=0.4,
                    vf_coef=0.5,
                    use_sde=True,
                    sde_sample_freq=4,
                    policy_kwargs=policy_kwargs,
                    verbose=1,
                    tensorboard_log=tb_log,
                    device="auto")
    else:
        print('duh')
        # tmp_test_name = 'SAC-Continued'
        # tb_log_name = tmp_test_name + '_' + env_name
        # tmp_log_dir = os.path.join('log', tmp_test_name)
        # tmp_model_stats_path = os.path.join(tmp_log_dir, 'Model_' + tb_log_name)
        # tmp_env_stats_path = os.path.join(tmp_log_dir, 'Env_' + tb_log_name)
        # tmp_best_path = os.path.join(tmp_log_dir, 'saved_models')
        # tmp_load_path = os.path.join(tmp_best_path, 'rl_model_3900000_steps')
        # # Load environment
        # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        # env = VecNormalize.load(tmp_env_stats_path, env)
        # env.reset()
        # # Separate evaluation env
        # eval_env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        # eval_env = VecNormalize.load(tmp_env_stats_path, eval_env)
        # eval_env.reset()
        # # Load model
        # # model = SAC.load(model_stats_path, tensorboard_log=tb_log)
        # model = SAC.load(tmp_load_path, tensorboard_log=tb_log, learning_rate=1e-6)
        # # model.learning_rate = 1e-5
        # model.set_env(env)

    if DoTraining:
        checkpoint_callback = CheckpointCallback(save_freq=eval_freq, save_path=checkpoint_path)
        # Use deterministic actions for evaluation
        eval_callback = EvalCallback(eval_env,
                                     best_model_save_path=best_path,
                                     log_path=best_path,
                                     eval_freq=eval_freq,
                                     deterministic=True,
                                     render=False)
        # Video update callback: record a clip and snapshot env/model every vid_freq steps
        record_callback = RecordVideo(env_name, videoName=videoName, videoPath=video_path, verbose=1)
        envSave_callback = SaveEnvVariable(env, model, env_stats_path, model_stats_path)
        nStep_callback_list = CallbackList([record_callback, envSave_callback])
        # nStep_callback_list = CallbackList([envSave_callback])
        vid_callback = EveryNTimesteps(n_steps=vid_freq, callback=nStep_callback_list)

        # Create the callback list
        callbacks = CallbackList([checkpoint_callback, eval_callback, vid_callback])
        # callbacks = CallbackList([checkpoint_callback, eval_callback])

        print(tb_log_name)
        model.learn(total_timesteps=total_timesteps,
                    tb_log_name=tb_log_name,
                    reset_num_timesteps=False,
                    callback=callbacks)
        # Don't forget to save the VecNormalize statistics when saving the agent
        model.save(model_stats_path)
        env.save(env_stats_path)

    if DoVideo:
        record_video(env_name, env, model, videoLength=1000, prefix='best' + videoName, videoPath=video_path)
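# RecordVideo and SaveEnvVariable above are custom callbacks not shown in the
# script. Based purely on its constructor arguments, SaveEnvVariable plausibly
# looks like this sketch: a BaseCallback that snapshots the model weights and
# the VecNormalize statistics each time EveryNTimesteps fires, so training can
# resume from a consistent model/env pair. The body is an assumption.
from stable_baselines3.common.callbacks import BaseCallback

class SaveEnvVariable(BaseCallback):
    def __init__(self, env, model, env_stats_path, model_stats_path, verbose=0):
        super().__init__(verbose)
        self.vec_env = env
        self.saved_model = model
        self.env_stats_path = env_stats_path
        self.model_stats_path = model_stats_path

    def _on_step(self) -> bool:
        # Persist both halves of the agent: network weights and the obs/reward
        # running statistics; loading one without the other degrades the policy.
        self.saved_model.save(self.model_stats_path)
        self.vec_env.save(self.env_stats_path)
        return True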