# Imports these tests assume (SB3 module layout):
import json
import os
import uuid

import gym
import pandas

from stable_baselines3.common.monitor import Monitor, get_monitor_files, load_results


def test_monitor_load_results(tmp_path):
    """
    Test load_results on log files produced by the monitor wrapper
    """
    tmp_path = str(tmp_path)
    env1 = gym.make("CartPole-v1")
    env1.seed(0)
    monitor_file1 = os.path.join(
        tmp_path, "stable_baselines-test-{}.monitor.csv".format(uuid.uuid4()))
    monitor_env1 = Monitor(env1, monitor_file1)
    monitor_files = get_monitor_files(tmp_path)
    assert len(monitor_files) == 1
    assert monitor_file1 in monitor_files

    monitor_env1.reset()
    episode_count1 = 0
    for _ in range(1000):
        _, _, done, _ = monitor_env1.step(monitor_env1.action_space.sample())
        if done:
            episode_count1 += 1
            monitor_env1.reset()

    results_size1 = len(load_results(os.path.join(tmp_path)).index)
    assert results_size1 == episode_count1

    env2 = gym.make("CartPole-v1")
    env2.seed(0)
    monitor_file2 = os.path.join(
        tmp_path, "stable_baselines-test-{}.monitor.csv".format(uuid.uuid4()))
    monitor_env2 = Monitor(env2, monitor_file2)
    monitor_files = get_monitor_files(tmp_path)
    assert len(monitor_files) == 2
    assert monitor_file1 in monitor_files
    assert monitor_file2 in monitor_files

    monitor_env2.reset()
    episode_count2 = 0
    for _ in range(1000):
        _, _, done, _ = monitor_env2.step(monitor_env2.action_space.sample())
        if done:
            episode_count2 += 1
            monitor_env2.reset()

    results_size2 = len(load_results(os.path.join(tmp_path)).index)
    assert results_size2 == (results_size1 + episode_count2)

    os.remove(monitor_file1)
    os.remove(monitor_file2)
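# A minimal sketch (an addition, not part of the original tests) of consuming
# the logs checked above: load_results returns a pandas DataFrame with one row
# per episode and the columns verified in test_monitor below ("r" = episode
# reward, "l" = episode length, "t" = elapsed time).
def summarize_monitor_logs(log_dir):
    df = load_results(log_dir)
    print("episodes:", len(df.index))
    print("mean episode reward:", df["r"].mean())
    print("mean episode length:", df["l"].mean())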
# This script assumes: statistics, matplotlib.pyplot as plt,
# stable_baselines3.common.monitor.Monitor, and the project's own PuzzleEnv,
# IMG_SIZE, CHANNEL_NUM, and OBS_CONF.
def test(model, test_images):
    """Roll out a trained model on the puzzle environment and report statistics."""
    test_env = Monitor(
        PuzzleEnv(images=test_images,
                  img_size=IMG_SIZE,
                  channel_num=CHANNEL_NUM,
                  puzzle_size=(3, 3),
                  puzzle_type="switch",
                  dist_type="manhattan",
                  penalty_for_step=-0.2,
                  reward_for_completiton=20,
                  positive_reward_coefficient=1.0,
                  obs_conf=OBS_CONF))
    solutions = []
    rews = []
    steps = []
    sample = len(test_images)
    errors = 0
    for _ in range(sample):
        i = 0
        done = False
        obs = test_env.reset()
        frames = [obs]
        while not done:
            i += 1
            action, _states = model.predict(obs)
            obs, reward, done, info = test_env.step(action)
            frames.append(obs)
            rews.append(reward)
            if i == 10000:  # safety cap: count episodes that never terminate as errors
                errors += 1
                break
        solutions.append(frames)
        print(i, sum(rews), rews)
        rews = []
        steps.append(i)
    print('Average steps taken: ', sum(steps) / sample)
    print('Median of steps taken: ', statistics.median(steps))
    print('Number of errors: ', errors)
    plt.hist(steps, bins=9)
    plt.savefig('fig.png')
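# Hypothetical usage sketch (the algorithm class, model path, and image loader
# are assumptions, not from the original script):
# model = PPO.load("puzzle_model.zip")
# test(model, test_images=load_images("data/test/"))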
def test_monitor(tmp_path):
    """
    Test the monitor wrapper
    """
    env = gym.make("CartPole-v1")
    env.seed(0)
    monitor_file = os.path.join(
        str(tmp_path), "stable_baselines-test-{}.monitor.csv".format(uuid.uuid4()))
    monitor_env = Monitor(env, monitor_file)
    monitor_env.reset()
    total_steps = 1000
    ep_rewards = []
    ep_lengths = []
    ep_len, ep_reward = 0, 0
    for _ in range(total_steps):
        _, reward, done, _ = monitor_env.step(monitor_env.action_space.sample())
        ep_len += 1
        ep_reward += reward
        if done:
            ep_rewards.append(ep_reward)
            ep_lengths.append(ep_len)
            monitor_env.reset()
            ep_len, ep_reward = 0, 0

    monitor_env.close()
    assert monitor_env.get_total_steps() == total_steps
    assert sum(ep_lengths) == sum(monitor_env.get_episode_lengths())
    assert sum(monitor_env.get_episode_rewards()) == sum(ep_rewards)
    _ = monitor_env.get_episode_times()

    with open(monitor_file, "rt") as file_handler:
        first_line = file_handler.readline()
        assert first_line.startswith("#")
        metadata = json.loads(first_line[1:])
        assert metadata["env_id"] == "CartPole-v1"
        assert set(metadata.keys()) == {"env_id", "t_start"}, \
            "Incorrect keys in monitor metadata"
        last_logline = pandas.read_csv(file_handler, index_col=None)
        assert set(last_logline.keys()) == {"l", "t", "r"}, \
            "Incorrect keys in monitor logline"
    os.remove(monitor_file)
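# A minimal sketch (not part of the original tests) of the file layout the
# assertions above verify: a '#'-prefixed JSON header line followed by a CSV body.
def read_monitor_file(path):
    with open(path, "rt") as fh:
        metadata = json.loads(fh.readline()[1:])        # {"env_id": ..., "t_start": ...}
        episodes = pandas.read_csv(fh, index_col=None)  # columns: r, l, t
    return metadata, episodes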
def learn(self, initial_models):
    # Note: unnecessarily initializes parameters (could speed this up by
    # skipping default initialization, since set_parameters overwrites them)
    mesa_algo = TD3("MlpPolicy", self.env, verbose=1, learning_starts=1)
    mesa_algo.set_parameters(to_torch(initial_models), exact_match=False)

    LOG_DIR = "/home/jet/catkin_ws/src/marsha/marsha_ai/training/logs/"
    MODEL_DIR = "/home/jet/catkin_ws/src/marsha/marsha_ai/training/models/"

    callback_list = []
    callback_list.append(TensorboardCallback())
    callback_list.append(StopTrainingOnMaxEpisodes(max_episodes=5, verbose=1))
    # callback_list.append(EvalCallback(self.env, best_model_save_path=MODEL_DIR,
    #                                   log_path=LOG_DIR, deterministic=True,
    #                                   eval_freq=5, n_eval_episodes=1))

    # total_timesteps could instead come from
    # rospy.get_param("/hyperparameters/total_timesteps")
    mesa_algo.learn(total_timesteps=1000, callback=callback_list)

    print("finished training! Testing mesa network...")
    test_buffer = ReplayBuffer(100,
                               TaskEnv.observation_space,
                               TaskEnv.action_space,
                               device="cuda")
    test_env = Monitor(self.env)
    done = False
    ob = test_env.reset()
    while not done:
        action, state = mesa_algo.predict(ob)
        next_ob, reward, done, info = test_env.step(action)
        test_buffer.add(ob, next_ob, action, reward, done, [info])
        ob = next_ob

    meta_buffer = {"test": test_buffer, "train": mesa_algo.replay_buffer}
    optimized_mesa_parameters = mesa_algo.get_parameters()
    tf_mesa_models = from_torch(optimized_mesa_parameters)
    return meta_buffer, tf_mesa_models
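# A minimal sketch (an assumption, not from the original code) of consuming the
# returned buffers via SB3's ReplayBuffer.sample API, which returns a
# ReplayBufferSamples named tuple:
# meta_buffer, tf_models = agent.learn(initial_models)
# batch = meta_buffer["test"].sample(batch_size=16)
# print(batch.observations.shape, batch.rewards.shape)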
    # (Snippet begins mid-call: these are the trailing keyword arguments of the
    # model constructor; the leading arguments are not shown in the original.)
    verbose=1,
    buffer_size=int(1e6),
    learning_rate=1e-3,
    gamma=0.95,
    tensorboard_log="./her_overcooked",
    batch_size=256,
    online_sampling=online_sampling,
    action_noise=action_noise,
    # policy_kwargs=dict(net_arch=[256, 256, 256]),
)
# model = HER.load('./her_bit_env250.zip', env=env)

# Train the model, checkpointing every 10k steps
for i in range(1000):
    model.learn(10000)
    model.save(f"./her_bit_env{i}")

# model = HER.load('./her_bit_env', env=env)

episode_reward = 0.0
obs = env.reset()
for _ in range(100):
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    episode_reward += reward
    if done or info.get("is_success", False):
        print("Reward:", episode_reward, "Success?",
              info.get("is_success", False))
        episode_reward = 0.0
        obs = env.reset()
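# A minimal sketch for resuming from one of the checkpoints saved above,
# mirroring the commented-out load calls (the filename index is illustrative):
# model = HER.load("./her_bit_env42.zip", env=env)
# model.learn(10000)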
class OffPolicy_BaseLine(RLSPAgent):
    """
    RLSP DDPG Agent

    This class creates a DDPG agent with params for RLSP.
    """
    def __init__(self, agent_helper):
        self.agent_helper = agent_helper
        # create model
        # TODO: add number of envs for multiprocessing later, for faster training
        self.create()

    def create(self, n_envs=1):
        """Create the agent"""
        self.env = self.agent_helper.env
        log_dir = self.agent_helper.config_dir
        os.makedirs(log_dir, exist_ok=True)
        self.env = Monitor(self.env, log_dir)
        # TODO: create the DDPG policy and define its hyperparameters here,
        # including the action space and observation space.
        policy_name = self.agent_helper.config['policy']
        self.policy = eval(policy_name)
        # action_noise = NormalActionNoise(mean=np.zeros(n_actions),
        #                                  sigma=0.1 * np.ones(n_actions))
        n_actions = int(self.agent_helper.env.action_space.shape[0])
        action_noise = NormalActionNoise(
            mean=np.zeros(n_actions),
            sigma=self.agent_helper.config['rand_sigma'] * np.ones(n_actions))
        # FIXME: test:
        # self.model = DDPG("MlpPolicy", self.env, action_noise=action_noise,
        #                   verbose=1, tensorboard_log=self.agent_helper.graph_path)
        # TODO: fix the observation space and action space later. Test whether the
        # observation space input and the output action space are correct.
        # activ_function_name = self.agent_helper.config['nn_activ']
        # activ_function = eval(activ_function_name)
        # policy_kwargs = dict(activation_fn=activ_function,
        #                      net_arch=[dict(pi=[32, 32], qf=[32, 32])])
        policy_kwargs = dict(net_arch=self.agent_helper.config['layers'])
        self.model = OffPolicyAlgorithm(
            self.policy,
            self.env,
            learning_rate=self.agent_helper.config['learning_rate'],
            buffer_size=self.agent_helper.config['buffer_size'],
            batch_size=self.agent_helper.config['batch_size'],
            tau=self.agent_helper.config['tau'],
            gamma=self.agent_helper.config['gamma'],
            gradient_steps=self.agent_helper.config['gradient_steps'],
            action_noise=action_noise,
            optimize_memory_usage=self.agent_helper.config['optimize_memory_usage'],
            create_eval_env=self.agent_helper.config['create_eval_env'],
            policy_kwargs=policy_kwargs,
            verbose=self.agent_helper.config['verbose'],
            learning_starts=self.agent_helper.config['learning_starts'],
            tensorboard_log=self.agent_helper.graph_path,
            seed=self.agent_helper.seed)

    def test_env(self):
        logger.info(f"Model: {self.model.get_env()}")

    def fit(self, env, episodes, verbose, episode_steps, callbacks,
            log_interval, agent_id=-1):
        """Mask the agent fit function to train the agent."""
        logger.info("Starting training")
        # self.model.learn(total_timesteps=100, log_interval=10)
        # FIXME: use a meaningful tb_log_name!
        # Callbacks used during training:
        # CheckpointCallback: save the model every `save_freq` steps.
        checkpoint_callback = CheckpointCallback(
            save_freq=96,
            save_path=self.agent_helper.config_dir,
            name_prefix='rl_model')
        # EvalCallback: evaluate every eval_freq steps and save the best model
        # to best_model_save_path.
        eval_env = env
        eval_callback = EvalCallback(eval_env,
                                     best_model_save_path='./logs/',
                                     log_path='./logs/',
                                     eval_freq=500,
                                     deterministic=True,
                                     render=False)
        # StopTrainingOnRewardThreshold: stop training once the reward threshold
        # is reached, i.e. once the agent is judged good enough.
        callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=70,
                                                         verbose=1)
        eval_callback_reward_threshold = EvalCallback(
            eval_env, callback_on_new_best=callback_on_best, verbose=1)
        # EveryNTimesteps: trigger a callback every n steps to save the model.
        checkpoint_on_event = CheckpointCallback(save_freq=1, save_path='./logs/')
        event_callback_after_n_steps = EveryNTimesteps(
            n_steps=500, callback=checkpoint_on_event)
        # StopTrainingOnMaxEpisodes: stop training when the model reaches the
        # maximum number of episodes.
        callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=5,
                                                          verbose=1)
        # CallbackList: chain several callbacks together.
        callbacklist = CallbackList([checkpoint_callback, eval_callback])
        logger.info(f"Model: {self.model.get_env()}")
        with ProgressBarManager(log_interval) as progress_callback:
            self.model.learn(total_timesteps=log_interval,
                             callback=[progress_callback, checkpoint_callback])
        # mean_reward, std_reward = evaluate_policy(
        #     self.model, self.model.get_env(), n_eval_episodes=10)
        # self.eval_writer(mean_reward, std_reward)

    def test(self, env, episodes, verbose, episode_steps, callbacks, sim):
        """Mask the agent test function."""
        logger.info(f"episodes: {episodes}, episode_steps: {episode_steps}")
        if self.agent_helper.train:
            # Create a fresh simulator with the test argument
            logger.info("Create new Environment!")
            self.agent_helper.env.simulator = create_simulator(self.agent_helper)
        obs = self.env.reset()
        self.setup_writer()
        self.setup_run_writer()
        episode = 1
        step = 0
        episode_reward = 0.0
        done = False
        # action, _states = self.model.predict(obs)
        # obs, reward, dones, info = self.env.step(action)
        # logger.info(f"info: {info}")
        # Test for 1 episode. Note: the env's own done flag is discarded
        # ("dones"); termination is driven by the step thresholds below.
        while not done:
            action, _states = self.model.predict(obs)
            obs, reward, dones, info = self.env.step(action)
            episode_reward += reward
            self.write_run_reward(step, reward)
            if sim:
                step = info['sim_time']
                if step >= (self.agent_helper.episode_steps *
                            self.agent_helper.n_steps_per_episode):
                    done = True
                    self.write_reward(episode, episode_reward)
            else:
                step = info['step']
                if step >= self.agent_helper.episode_steps:
                    done = True
                    self.write_reward(episode, episode_reward)
            # episode += 1
            # sys.stdout.write(
            #     "\rTesting:" +
            #     f"Current Simulator Time: {step}. "
            #     f"Testing duration: {self.agent_helper.episode_steps}\n")
            # sys.stdout.flush()
        # print("")

    def save_weights(self, file, overwrite=True):
        weights_file = f"{file}weights"
        dir_path = os.path.dirname(os.path.realpath(weights_file))
        os.makedirs(dir_path, exist_ok=True)
        # After training is done, save the final weights in the result_base_path.
logger.info("saving model and weights to %s", weights_file) # self.agent.save_weights(weights_file, overwrite) self.model.save(weights_file) pass def load_weights(self, weights_file): """ Load the model from a zip archive """ self.model = OffPolicyAlgorithm.load(weights_file) pass def setup_writer(self): episode_reward_filename = f"{self.agent_helper.config_dir}/episode_reward.csv" episode_reward_header = ['episode', 'reward'] self.episode_reward_stream = open(episode_reward_filename, 'a+', newline='') self.episode_reward_writer = csv.writer(self.episode_reward_stream) self.episode_reward_writer.writerow(episode_reward_header) def setup_run_writer(self): run_reward_filename = f"{self.agent_helper.config_dir}/run_reward.csv" run_reward_header = ['run', 'reward'] self.run_reward_stream = open(run_reward_filename, 'a+', newline='') self.run_reward_writer = csv.writer(self.run_reward_stream) self.run_reward_writer.writerow(run_reward_header) def write_reward(self, episode, reward): self.episode_reward_writer.writerow([episode, reward]) def write_run_reward(self, step, reward): self.run_reward_writer.writerow([step, reward]) def eval_writer(self, mean_reward, std_reward): episode_reward_filename = f"{self.agent_helper.config_dir}evaluate_agent.csv" episode_reward_header = ['mean_reward', 'std_reward'] self.episode_reward_stream = open(episode_reward_filename, 'a+', newline='') self.episode_reward_writer = csv.writer(self.episode_reward_stream) self.episode_reward_writer.writerow(episode_reward_header) self.episode_reward_writer.writerow([mean_reward, std_reward]) def eval_writer(self, mean_reward, std_reward): episode_reward_filename = f"{self.agent_helper.config_dir}evaluate_agent.csv" episode_reward_header = ['mean_reward', 'std_reward'] self.episode_reward_stream = open(episode_reward_filename, 'a+', newline='') self.episode_reward_writer = csv.writer(self.episode_reward_stream) self.episode_reward_writer.writerow(episode_reward_header) self.episode_reward_writer.writerow([mean_reward, std_reward])