import numpy as np

from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize


def test_vec_env(tmpdir):
    """Test VecNormalize Object"""
    clip_obs = 0.5
    clip_reward = 5.0

    orig_venv = DummyVecEnv([make_env])
    norm_venv = VecNormalize(orig_venv, norm_obs=True, norm_reward=True,
                             clip_obs=clip_obs, clip_reward=clip_reward)

    _, done = norm_venv.reset(), [False]
    while not done[0]:
        actions = [norm_venv.action_space.sample()]
        obs, rew, done, _ = norm_venv.step(actions)
        # Normalized observations and rewards must respect the clip ranges.
        assert np.max(np.abs(obs)) <= clip_obs
        assert np.max(np.abs(rew)) <= clip_reward

    # Round-trip the wrapper through save/load and compare the results.
    path = str(tmpdir.join("vec_normalize"))
    norm_venv.save(path)
    deserialized = VecNormalize.load(path, venv=orig_venv)
    check_vec_norm_equal(norm_venv, deserialized)
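# The test relies on two helpers defined elsewhere in the suite. A minimal
# sketch of what they might look like; the env choice ("Pendulum-v0") and
# the compared attributes are assumptions, not taken from this source:
import gym


def make_env():
    # Any small continuous-action env is enough to exercise the wrapper.
    return gym.make("Pendulum-v0")


def check_vec_norm_equal(norm_venv, deserialized):
    # Compare the running statistics and settings that VecNormalize saves.
    assert np.allclose(norm_venv.obs_rms.mean, deserialized.obs_rms.mean)
    assert np.allclose(norm_venv.obs_rms.var, deserialized.obs_rms.var)
    assert np.allclose(norm_venv.ret_rms.mean, deserialized.ret_rms.mean)
    assert np.allclose(norm_venv.ret_rms.var, deserialized.ret_rms.var)
    assert norm_venv.clip_obs == deserialized.clip_obs
    assert norm_venv.clip_reward == deserialized.clip_reward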
eval_callback = EvalCallback(eval_env, best_model_save_path='Agent007',
                             log_path='./logs/', eval_freq=10000,
                             deterministic=True, render=False,
                             n_eval_episodes=1)

model = PPO2(MlpPolicy, env, n_steps=1000, nminibatches=32, lam=0.98,
             gamma=0.999, learning_rate=1e-4, noptepochs=4, ent_coef=0.01,
             verbose=1, tensorboard_log="./rocket_tensorboard/",
             policy_kwargs=dict(layers=[400, 300]))
# model = PPO2.load("TestHover", env=env, tensorboard_log="./rocket_tensorboard/")
# while True:
#     model.learning_rate = 3e-5

model.learn(total_timesteps=5000000, callback=eval_callback)
model.save("TestHover")
env.save("TestHover_env")

del model  # remove to demonstrate saving and loading

model = PPO2.load("TestHover", env=eval_env)

# Enjoy trained agent
obs = eval_env.reset()
data = []
time = []
actions = []
alt_reward = []
mix_reward = []
temp_reward = []
valveChange = []
speedPunishes = []
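# The rollout loop that fills the buffers above is not part of this excerpt;
# a minimal sketch of what it could look like (the loop body is an
# assumption, not the author's code -- the reward-component buffers would
# need values exposed by the env, e.g. via info):
done = [False]
step = 0
while not done[0]:
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = eval_env.step(action)
    data.append(obs[0])
    actions.append(action[0])
    time.append(step)
    step += 1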
class RocketTrainer:
    def __init__(self, algorithm="SAC", load=True, agent_name="Agent001"):
        self.agent_name = agent_name
        # self.env = LearningRocket(visualize=False)
        # self.env = NormalizeActionWrapper(self.env)
        # self.eval_env = LearningRocket(visualize=True)
        # self.eval_env = NormalizeActionWrapper(self.eval_env)
        # self.env = SubprocVecEnv([lambda: LearningRocket(visualize=False) for i in range(4)])
        self.env = make_vec_env(LearningRocket, n_envs=16)
        # [lambda: LearningRocket(visualize=False) for i in range(16)]
        # self.eval_env = VecNormalize(DummyVecEnv([lambda: LearningRocket(visualize=True) for i in range(1)]))
        self.eval_env = make_vec_env(lambda: LearningRocket(visualize=True),
                                     n_envs=1)
        # self.eval_env = VecNormalize(self.eval_env)
        self.eval_callback = EvalCallback(self.eval_env,
                                          best_model_save_path='Agent007',
                                          log_path='./logs/',
                                          eval_freq=10000,
                                          deterministic=True, render=False,
                                          n_eval_episodes=1)
        kai_policy = dict(act_fun=tf.nn.tanh, net_arch=[400, 300])
        # check_env(self.env, warn=True)
        """
        if algorithm == "SAC":
            if load is True:
                self.model = SAC.load(agent_name, env=self.env,
                                      tensorboard_log="./rocket_tensorboard/")
                # self.model.ent_coef = 0.2
            else:
                self.model = SAC('MlpPolicy', self.env, verbose=1,
                                 tensorboard_log="./rocket_tensorboard/",
                                 ent_coef=5)
            print("Trainer Set for SAC")
        """
        if algorithm == "TD3":
            n_actions = self.env.action_space.shape[-1]
            action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                             sigma=0.1 * np.ones(n_actions))
            if load is True:
                self.model = TD3.load(agent_name, env=self.env,
                                      tensorboard_log="./rocket_tensorboard/")
                # file = open('replay_buffer', 'rb')
                # self.model.replay_buffer = pickle.load(file)
                # file.close()
            else:
                self.model = TD3(MlpPolicy, self.env,
                                 action_noise=action_noise, batch_size=768,
                                 gamma=0.95, learning_rate=1e-4,
                                 learning_starts=20000, verbose=1,
                                 tensorboard_log="./rocket_tensorboard/",
                                 policy_kwargs=dict(layers=[400, 300]))
            print("Trainer Set for TD3")
        elif algorithm == "PPO2":
            if load is True:
                self.model = PPO2.load(agent_name, env=self.env,
                                       tensorboard_log="./rocket_tensorboard/")
                self.eval_env = VecNormalize.load(self.agent_name + "vEnv",
                                                  self.eval_env)
                # self.eval_env.clip_obs = 500
                # self.env = VecNormalize(self.env)
                self.env = VecNormalize.load(self.agent_name + "vEnv",
                                             self.env)
                # self.env.clip_obs = 500
                # self.env.norm_obs = False
                # self.eval_env.norm_obs = False
            else:
                self.model = PPO2(PPOMlpPolicy, self.env, n_steps=1024,
                                  nminibatches=32, lam=0.98, gamma=0.999,
                                  noptepochs=4, ent_coef=0.01, verbose=1,
                                  tensorboard_log="./rocket_tensorboard/",
                                  policy_kwargs=dict(layers=[400, 300]))
                self.eval_env = VecNormalize(self.eval_env)
                self.env = VecNormalize(self.env)
                # self.eval_env.clip_obs = 500
                # self.env.clip_obs = 500
                # self.env.norm_obs = False
                # self.eval_env.norm_obs = False
            print("Trainer set for PPO2. I am speed.")

    def train(self, visualize=False, lesson_length=100000, lessons=1):
        print("Today I'm teaching rocket science. How hard can it be?")
How hard can it be?") #self.env.render(visualize) for i in range(lessons): print("*sigh* here we go again.") self.model.learn( total_timesteps=lesson_length, callback=self.eval_callback) #,callback=self.eval_callback) self.model.save(self.agent_name) self.env.save(self.agent_name + "vEnv") #self.eval_env = VecNormalize.load(self.agent_name + "vEnv",self.eval_env) #a_file = open('replay_buffer', 'wb') #pickle.dump(self.model.replay_buffer, a_file) #a_file.close() print("{} Batches Done.".format(i + 1)) # plt.close() mean_reward, std_reward = evaluate_policy(self.model, self.eval_env, n_eval_episodes=1) print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}") self.evaluate() def lecture(self): teacher = DummyExpert() #teacher = NormalizeActionWrapper(teacher) print("Let me show you how it's done.") generate_expert_traj(teacher.teach, 'dummy_expert_rocket', self.env, n_episodes=10) def evaluate(self): self.eval_env.training = False self.eval_env.norm_reward = False print("Watch this!") obs = self.eval_env.reset() #self.eval_env.render(True) mean_reward, std_reward = evaluate_policy(self.model, self.eval_env, n_eval_episodes=1) print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}") reward_list = [] reward_sum: List[float] = [] action_list = [] for i in range(3): action_list.append([]) Time = [] steps = 0 cumulativeReward = 0 data = [] for i in range(obs.size): data.append([]) for j in range(1000): action, states = self.model.predict(obs, deterministic=True) obs, reward, done, info = self.eval_env.step(action) #re_obs = self.eval_env.rescale_observation((obs)) #obs = self.eval_env.get_original_obs() #action = self.eval_env.rescale_action(action) reward_list.append(reward[0]) cumulativeReward += reward[0] reward_sum.append(cumulativeReward) action_list[0].append(action[0]) #for i in range(3): # action_list[i].append(action[i]) for i in range(obs.size): data[i].append(obs[0][i]) steps += 1 Time.append(steps) print("Another happy landing.") plt.figure(figsize=(11, 8)) plt.subplot(3, 2, 3) plt.xlabel('Time(s)') plt.ylabel('Position (m)') plt.plot(Time, data[0], label='X Position') plt.plot(Time, data[1], label='Speed') #plt.plot(Time, data[2], label='Z Position') plt.legend(loc='best') plt.subplot(3, 2, 1) plt.xlabel('Time(s)') plt.ylabel('Reward') plt.plot(Time, reward_list, label='Reward') plt.plot(Time, reward_sum, label='Total Reward') plt.legend(loc='best') plt.subplot(3, 2, 2) plt.xlabel('Time(s)') plt.ylabel('Actions') plt.plot(Time, action_list[0], label='Thrust') #plt.plot(Time, action_list[1], label='GimbalX') #plt.plot(Time, action_list[2], label='GimbalY') plt.legend(loc='best') plt.subplot(3, 2, 4) plt.xlabel('Time(s)') plt.ylabel('Attitude') #plt.plot(Time, data[4], label='Roll') #plt.plot(Time, data[4], label='Pitch') #plt.plot(Time, data[5], label='Yaw') plt.legend(loc='best') plt.subplot(3, 2, 5) plt.xlabel('Time(s)') plt.ylabel('Velocity') #plt.plot(Time, data[2], label='vX') #plt.plot(Time, data[3], label='vY') #plt.plot(Time, data[5], label='vZ') plt.legend(loc='best') plt.subplot(3, 2, 6) plt.xlabel('Time(s)') plt.ylabel('RotVel') #plt.plot(Time, data[12], label='Fuel') #plt.plot(Time, data[6], label='Rot X') #plt.plot(Time, data[7], label='Rot Y') plt.legend(loc='best') plt.tight_layout() plt.show()
def train(method="SAC"): def get_multi_process_env(num_of_envs, subprocess=True, amplitude_scaling=False, frameskip=5, with_goals=False, action_type=ActionType.POSITION, difficulty=1, initializer="random", testing=False): if initializer == "random": initializer = RandomInitializer(difficulty=difficulty) elif initializer == "completely_random": initializer = CompletelyRandomInitializer() def _make_env(rank): def _init(): obs_type = ObservationType.WITH_GOALS if with_goals else ObservationType.WITHOUT_GOALS out_env = CubeEnv(frameskip=frameskip, visualization=False, initializer=initializer, action_type=action_type, observation_type=obs_type, testing=testing) out_env.seed(seed=54321) out_env.action_space.seed(seed=54321) if not with_goals: out_env = FlatObservationWrapper( out_env, amplitude_scaling=amplitude_scaling) out_env = TimeFeatureWrapper(out_env, max_steps=math.ceil( 3750 / frameskip)) else: out_env = GoalObservationWrapper( out_env, amplitude_scaling=amplitude_scaling) return out_env return _init if subprocess: return SubprocVecEnv( [_make_env(rank=i) for i in range(num_of_envs)]) else: return DummyVecEnv([_make_env(rank=i) for i in range(num_of_envs)]) date_time_str = datetime.now().strftime("%m_%d_%Y_%H_%M_%S_") print(method, date_time_str) set_global_seeds(0) if method == "HER": env = get_multi_process_env(1, subprocess=False, amplitude_scaling=True, frameskip=5, with_goals=True) env.set_attr("reward_range", 1000) policy_kwargs = dict(layers=[128, 128], act_fun=tf.tanh) n_actions = env.action_space.shape[-1] action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.2) * np.ones(n_actions)) model = HER("MlpPolicy", env, SAC, policy_kwargs=policy_kwargs, n_sampled_goal=4, goal_selection_strategy='future', verbose=1, tensorboard_log="tblogs", batch_size=512, buffer_size=100000, gamma=0.98, learning_starts=10000, random_exploration=0.15) model.learn(int(2e6), log_interval=10, callback=CheckpointCallback( save_freq=int(1e5), save_path='models/checkpoint_saves', name_prefix=method + '_' + date_time_str), tb_log_name=method + '_' + date_time_str) if method == "SAC": env = VecNormalize(VecFrameStack( get_multi_process_env(1, subprocess=False, amplitude_scaling=False, frameskip=5, action_type=ActionType.POSITION, difficulty=1, initializer="completely_random"), 4), norm_reward=False, clip_reward=1500, gamma=0.99) policy_kwargs = dict(layers=[256, 256]) n_actions = env.action_space.shape[-1] action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.2) * np.ones(n_actions)) model = SAC("LnMlpPolicy", env, policy_kwargs=policy_kwargs, buffer_size=1000000, batch_size=256, gamma=0.99, learning_rate=LinearSchedule(int(2e6), 5e-5, initial_p=3e-4).value, train_freq=64, gradient_steps=4, tau=0.005, learning_starts=10000, tensorboard_log="tblogs", verbose=1, use_emph_exp=True, action_noise=action_noise) model.learn(int(2e6), log_interval=10, callback=CheckpointCallback( save_freq=int(5e5), save_path='models/checkpoint_saves', name_prefix=method + '_' + date_time_str), tb_log_name=method + '_' + date_time_str) env.save("normalized_env_" + date_time_str) if method == "CONTINUE_SAC": difficulty = 4 env = VecNormalize.load( "models/normalized_env_frame_stacked_model", VecFrameStack( get_multi_process_env(1, subprocess=False, amplitude_scaling=True, frameskip=5, action_type=ActionType.POSITION, difficulty=difficulty, initializer="random", testing=True), 4)) model = SAC.load( "models/checkpoint_saves/SAC_09_18_2020_19_07_42__1000000_steps.zip", env=env, 
tensorboard_log="tblogs", ) model.learn(int(1e6), log_interval=10, callback=CheckpointCallback( save_freq=int(5e5), save_path='models/checkpoint_saves', name_prefix=method + '_' + date_time_str), tb_log_name=method + '_' + date_time_str) env.save("normalized_env_difficulty_" + str(difficulty)) model.save( os.path.join('models', "model_difficulty_" + str(difficulty))) if method == "save_vec_env": env = VecNormalize( get_multi_process_env(1, subprocess=False, amplitude_scaling=True, frameskip=5, action_type=ActionType.POSITION, difficulty=1, initializer="completely_random")) model = SAC.load( "models/checkpoint_saves/SAC_09_18_2020_14_27_30__2000000_steps.zip", env=env) model.learn(int(1e5), log_interval=1) env.save("normalized_env_without_framestack") return else: return print("save model: ", os.path.join('models', method + '_' + date_time_str))
def train(params, model=None, env=None):
    print("Training Parameters: ", params)
    data_dir, tb_path = get_paths(params)
    os.makedirs(data_dir, exist_ok=True)
    # Save parameters immediately
    params.save(data_dir)

    rank = mpi_rank_or_zero()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create the environment if not given
    if env is None:
        def make_env(i):
            env = get_env(params)
            print("ENV IN UTIL", env)
            # TODO: make monitor work for multiple agents.
            env = Monitor(env, data_dir + '/' + str(i),
                          allow_early_resets=params['early_reset'])
            return env
        # if 'PPO' in params['alg']:
        #     env = DummyVecEnv([(lambda n: lambda: make_env(n))(i)
        #                        for i in range(params['num_proc'])])
        # else:
        #     env = make_env(0)
        env = make_env(0)
        if params['normalize']:
            # VecNormalize expects a vectorized env, so wrap the single
            # env in a DummyVecEnv first.
            base_env = env
            env = VecNormalize(DummyVecEnv([lambda: base_env]))

    # Set the seeds
    if params['seed']:
        seed = params['seed'] + 100000 * rank
        set_global_seeds(seed)
        params['alg_args']['seed'] = seed

    if 'noise' in params and params['noise']:
        from stable_baselines.ddpg import OrnsteinUhlenbeckActionNoise
        n_actions = env.action_space.shape[-1]
        params['alg_args']['action_noise'] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions),
            sigma=float(params['noise']) * np.ones(n_actions))

    print("ENV", env, env.action_space)
    if model is None:
        alg = get_alg(params)
        policy = get_policy(params)
        model = alg(policy, env, verbose=1, tensorboard_log=tb_path,
                    policy_kwargs=params['policy_args'], **params['alg_args'])
    else:
        model.set_env(env)

    print("\n===============================\n")
    print("TENSORBOARD PATH:", tb_path)
    print("\n===============================\n")
    model.learn(total_timesteps=params['timesteps'],
                log_interval=params['log_interval'],
                callback=create_training_callback(
                    data_dir, params, env,
                    freq=params['eval_freq'],
                    checkpoint_freq=params['checkpoint_freq']))

    print("Saving model to", data_dir)
    model.save(data_dir + '/final_model')
    if params['normalize']:
        env.save(data_dir + '/environment.pkl')
    env.close()
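# The trainer reads these keys from `params`; a minimal sketch of a
# compatible configuration (values are illustrative only, and `params`
# must also provide a .save(dir) method as used above, so a plain dict
# would not suffice as-is):
EXAMPLE_PARAMS = {
    'alg': 'SAC',
    'policy_args': {},
    'alg_args': {},
    'timesteps': int(1e6),
    'log_interval': 10,
    'eval_freq': 10000,
    'checkpoint_freq': 50000,
    'normalize': True,
    'seed': 0,
    'early_reset': True,
    'num_proc': 1,
    'noise': 0.0,
}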