def test_save_callback(self):
    '''Test that the model performance can be monitored and results can be
    checked and saved as the model improves. This test trains an agent for
    a short period of time, without loading a pre-trained model. Therefore,
    this test also checks that an RL agent from stable-baselines can be
    trained.

    '''
    # Define logging directory. Monitoring data and agent model will be stored here
    log_dir = os.path.join(utilities.get_root_path(), 'examples',
                           'agents', 'monitored_A2C')

    # Perform a short training example with callback
    env, _, _ = run_save_callback.train_A2C_with_callback(
        log_dir=log_dir, tensorboard_log=None)

    # Load the trained agent
    model = A2C.load(os.path.join(log_dir, 'best_model'))

    # Test one step with the trained model
    obs = env.reset()
    df = pd.DataFrame([model.predict(obs)[0][0]], columns=['value'])
    df.index.name = 'keys'
    ref_filepath = os.path.join(utilities.get_root_path(), 'testing',
                                'references', 'save_callback.csv')
    self.compare_ref_values_df(df, ref_filepath)

    # Remove model to allow further testing
    shutil.rmtree(log_dir, ignore_errors=True)
def train_PPO2_predictive(start_time_tests=[31*24*3600, 304*24*3600],
                          episode_length_test=14*24*3600,
                          load=False):
    '''Method to train (or load a pre-trained) PPO2 agent. The testing
    periods must already be specified here so that they are excluded from
    the training process.

    Parameters
    ----------
    start_time_tests : list of integers
        Time in seconds from the beginning of the year that will be used
        for testing. These periods should be excluded in the training
        process. By default the first day of February and the first day
        of November are used.
    episode_length_test : integer
        Number of seconds indicating the length of the testing periods.
        By default two weeks are reserved for testing.
    load : boolean
        Boolean indicating whether the agent is loaded (True) or needs
        to be trained (False).

    '''
    excluding_periods = []
    for start_time_test in start_time_tests:
        excluding_periods.append((start_time_test,
                                  start_time_test + episode_length_test))
    # Summer period (from June 21st till September 22nd).
    # Excluded since no heating during this period (nothing to learn).
    excluding_periods.append((173*24*3600, 266*24*3600))

    env = BoptestGymEnvRewardWeightCost(url=url,
                                        actions=['oveHeaPumY_u'],
                                        observations={'reaTZon_y': (280., 310.),
                                                      'LowerSetp[1]': (280., 310.),
                                                      'UpperSetp[1]': (280., 310.),
                                                      'TDryBul': (250., 310.),
                                                      'HGloHor': (0., 1000.)},
                                        random_start_time=True,
                                        excluding_periods=excluding_periods,
                                        forecasting_period=1*24*3600,
                                        max_episode_length=1*24*3600,
                                        warmup_period=3*3600,
                                        Ts=900)
    env = NormalizedObservationWrapper(env)
    env = NormalizedActionWrapper(env)
    model = PPO2('MlpPolicy', env, verbose=1, gamma=0.99, seed=seed,
                 tensorboard_log=os.path.join('results'))

    if not load:
        model.learn(total_timesteps=int(1e5))
        # Save the agent
        model.save(os.path.join(utilities.get_root_path(), 'examples',
                                'agents', 'ppo2_pred_bestest_hydronic_heatpump'))
    else:
        # Load the trained agent
        model = PPO2.load(os.path.join(utilities.get_root_path(), 'examples',
                                       'agents', 'ppo2_pred_bestest_hydronic_heatpump'))

    return env, model, start_time_tests
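# A minimal usage sketch for the function above (hypothetical, not part of
# the original examples). It assumes a BOPTEST test case is running at `url`
# and that `seed` is defined at module level, as in the training function:
def example_usage_train_PPO2_predictive():
    # Load the pre-trained predictive agent and take one deterministic step
    env, model, start_time_tests = train_PPO2_predictive(load=True)
    obs = env.reset()
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    return reward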
def test_variable_episode(self):
    '''Test that a model can be trained using a variable episode length.
    The method used to determine whether the episode is terminated or not
    is defined by the user. This test trains an agent for a short period
    of time, without loading a pre-trained model. Therefore, this test
    also checks that an RL agent from stable-baselines can be trained.
    This test also uses the save callback to check that the variable
    episode length is being effectively used. Notice that this test also
    checks that child classes can be nested, since the example redefines
    the `compute_reward` and the `compute_done` methods.

    '''
    # Define logging directory. Monitoring data and agent model will be stored here
    log_dir = os.path.join(utilities.get_root_path(), 'examples',
                           'agents', 'variable_episode_A2C')

    # Perform a short training example with callback
    env, _, _ = run_variable_episode.train_A2C_with_variable_episode(
        log_dir=log_dir, tensorboard_log=None)

    # Load the trained agent
    model = A2C.load(os.path.join(log_dir, 'best_model'))

    # Test one step with the trained model
    obs = env.reset()
    df = pd.DataFrame([model.predict(obs)[0][0]], columns=['value'])
    df.index.name = 'keys'
    ref_filepath = os.path.join(utilities.get_root_path(), 'testing',
                                'references', 'variable_episode_step.csv')
    self.compare_ref_values_df(df, ref_filepath)

    # Check variable episode lengths
    monitor = pd.read_csv(os.path.join(log_dir, 'monitor.csv'),
                          index_col=None)
    monitor = monitor.iloc[1:]
    monitor.reset_index(inplace=True)
    monitor.columns = ['reward', 'episode_length', 'time']

    # Time may vary from one computer to another
    monitor.drop(labels='time', axis=1, inplace=True)

    # Utilities require the index to be named 'time' (even though that is
    # not the case here)
    monitor.index.name = 'time'

    # Transform to numeric
    monitor = monitor.apply(lambda col: pd.to_numeric(col, errors='coerce'))

    # Check that we always obtain the same monitoring parameters
    ref_filepath = os.path.join(utilities.get_root_path(), 'testing',
                                'references', 'variable_episode_monitoring.csv')
    self.compare_ref_timeseries_df(monitor, ref_filepath)

    # Remove model to allow further testing
    shutil.rmtree(log_dir, ignore_errors=True)
def check_obs_act_rew_kpi(self, obs=None, act=None, rew=None, kpi=None,
                          label='default'):
    '''Auxiliary method to check the observations, actions, rewards,
    and/or KPIs of a particular test case run.

    '''
    # Check observation values
    if obs is not None:
        df = pd.DataFrame(obs)
        df.index.name = 'time'
        ref_filepath = os.path.join(utilities.get_root_path(), 'testing',
                                    'references',
                                    'observations_{}.csv'.format(label))
        self.compare_ref_timeseries_df(df, ref_filepath)

    # Check action values
    if act is not None:
        df = pd.DataFrame(act)
        df.index.name = 'time'
        ref_filepath = os.path.join(utilities.get_root_path(), 'testing',
                                    'references',
                                    'actions_{}.csv'.format(label))
        self.compare_ref_timeseries_df(df, ref_filepath)

    # Check reward values
    if rew is not None:
        df = pd.DataFrame(rew)
        df.index.name = 'time'
        ref_filepath = os.path.join(utilities.get_root_path(), 'testing',
                                    'references',
                                    'rewards_{}.csv'.format(label))
        self.compare_ref_timeseries_df(df, ref_filepath)

    # Check KPI values
    if kpi is not None:
        df = pd.DataFrame(data=[kpi]).T
        df.columns = ['value']
        df.index.name = 'keys'
        # The time ratio is not checked since it depends on the machine
        # where the tests are run
        df.drop('time_rat', inplace=True)
        ref_filepath = os.path.join(utilities.get_root_path(), 'testing',
                                    'references',
                                    'kpis_{}.csv'.format(label))
        self.compare_ref_values_df(df, ref_filepath)
def test_behavior_cloning_disc(self):
    '''Check that an agent using a discrete action space (in this case we
    use DQN) can be pretrained using behavior cloning from an expert
    trajectory that needs to be generated beforehand. The test pretrains
    the agent for 1000 epochs and directly tests its performance without
    further learning.

    '''
    expert_traj = os.path.join(utilities.get_root_path(), 'examples',
                               'trajectories', 'expert_traj_disc_28.npz')
    self.partial_test_RL(case='D', algorithm='DQN', mode='train',
                         training_timesteps=0, expert_traj=expert_traj)
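# A minimal sketch of the behavior-cloning workflow that this test assumes,
# using the stable-baselines pretraining API. This is a hypothetical helper:
# the repository generates the expert trajectory and runs the pretraining
# elsewhere. `env` stands for any discrete-action BoptestGymEnv instance.
def example_pretrain_DQN_from_expert(env, expert_path):
    from stable_baselines import DQN
    from stable_baselines.gail import ExpertDataset
    # Wrap the expert trajectory (.npz) into a dataset for pretraining
    dataset = ExpertDataset(expert_path=expert_path, traj_limitation=-1,
                            batch_size=32)
    model = DQN('MlpPolicy', env, verbose=1)
    # Behavior cloning: supervised learning on the expert state-action pairs
    model.pretrain(dataset, n_epochs=1000)
    return model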
def test_reset_fixed(self):
    '''Test that the environment can reset using a fixed start time and a
    specific warmup period.

    '''
    self.env.random_start_time = False
    self.env.start_time = 14 * 24 * 3600
    self.env.warmup_period = 3 * 3600

    obs = self.env.reset()

    # Check values
    df = pd.DataFrame(data=[obs], index=['obs_reset_fixed'],
                      columns=['value'])
    df.index.name = 'keys'
    ref_filepath = os.path.join(utilities.get_root_path(), 'testing',
                                'references', 'reset_fixed.csv')
    self.compare_ref_values_df(df, ref_filepath)
def test_reset_random(self):
    '''Test that the environment can reset using a random start time that
    is outside of the specified `excluding_periods`. This test also checks
    that the seed for random initialization works properly.

    '''
    self.env.random_start_time = True
    self.env.warmup_period = 1 * 3600

    # Set the excluding periods to be the first two weeks of February
    # and the first two weeks of November
    excluding_periods = [(31 * 24 * 3600, 31 * 24 * 3600 + 14 * 24 * 3600),
                         (304 * 24 * 3600, 304 * 24 * 3600 + 14 * 24 * 3600)]
    self.env.excluding_periods = excluding_periods
    random.seed(123456)
    start_times = OrderedDict()

    # Reset a hundred times
    for i in range(100):
        obs = self.env.reset()
        start_time = self.env.start_time
        episode = (start_time, start_time + self.env.max_episode_length)
        for period in excluding_periods:
            # Make sure that the episodes do not overlap with excluding_periods
            assert not (episode[0] < period[1] and period[0] < episode[1]), \
                'reset is not working properly when generating random times. ' \
                'The episode with starting time {0} and end time {1} ' \
                'overlaps with period {2}. This corresponds to the ' \
                'generated starting time number {3}.' \
                ''.format(start_time,
                          start_time + self.env.max_episode_length,
                          period, i)
        start_times[start_time] = obs

    # Check values
    df = pd.DataFrame.from_dict(start_times, orient='index',
                                columns=['value'])
    df.index.name = 'keys'
    ref_filepath = os.path.join(utilities.get_root_path(), 'testing',
                                'references', 'reset_random.csv')
    self.compare_ref_values_df(df, ref_filepath)
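# The assert above uses the standard interval-overlap test: two intervals
# (a0, a1) and (b0, b1) overlap if and only if a0 < b1 and b0 < a1. A small
# illustrative helper (hypothetical, not part of the original tests):
def episodes_overlap(episode, period):
    '''Return True if the two (start, end) intervals overlap.'''
    return episode[0] < period[1] and period[0] < episode[1]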
def train_A2C_with_callback(start_time_tests=[31*24*3600, 304*24*3600],
                            episode_length_test=14*24*3600,
                            log_dir=os.path.join(utilities.get_root_path(),
                                                 'examples', 'agents',
                                                 'monitored_A2C'),
                            tensorboard_log=os.path.join('results')):
    '''Method to train an A2C agent using a callback to save the model
    upon performance improvement.

    Parameters
    ----------
    start_time_tests : list of integers
        Time in seconds from the beginning of the year that will be used
        for testing. These periods should be excluded in the training
        process. By default the first day of February and the first day
        of November are used.
    episode_length_test : integer
        Number of seconds indicating the length of the testing periods.
        By default two weeks are reserved for testing.
    log_dir : string
        Directory where the monitoring data and the best trained model
        are stored.
    tensorboard_log : path
        Path to the directory where the tensorboard logs are stored.

    '''
    excluding_periods = []
    for start_time_test in start_time_tests:
        excluding_periods.append((start_time_test,
                                  start_time_test + episode_length_test))
    # Summer period (from June 21st till September 22nd).
    # Excluded since no heating during this period (nothing to learn).
    excluding_periods.append((173*24*3600, 266*24*3600))

    # Use a one-hour episode only, to trigger the callback more often
    env = BoptestGymEnvRewardWeightCost(url=url,
                                        actions=['oveHeaPumY_u'],
                                        observations={'reaTZon_y': (280., 310.)},
                                        random_start_time=True,
                                        excluding_periods=excluding_periods,
                                        max_episode_length=1*3600,
                                        warmup_period=3*3600,
                                        step_period=900)
    env = NormalizedObservationWrapper(env)
    env = NormalizedActionWrapper(env)

    os.makedirs(log_dir, exist_ok=True)

    # Wrap the environment with a monitor that logs training data for the callback
    env = Monitor(env=env, filename=os.path.join(log_dir, 'monitor.csv'))

    # Create the callback: check every 10 steps. We keep it very short for testing
    callback = SaveOnBestTrainingRewardCallback(check_freq=10,
                                                log_dir=log_dir)

    # Initialize the agent
    model = A2C('MlpPolicy', env, verbose=1, gamma=0.99, seed=seed,
                tensorboard_log=tensorboard_log)

    # Train the agent with the callback for saving
    model.learn(total_timesteps=int(100), callback=callback)

    return env, model, start_time_tests
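# `SaveOnBestTrainingRewardCallback` is defined elsewhere in the examples.
# A minimal sketch of what such a callback might look like, following the
# pattern from the stable-baselines documentation (an assumption, not
# necessarily identical to the repository implementation):
import os
import numpy as np
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.results_plotter import load_results, ts2xy

class SaveOnBestTrainingRewardCallbackSketch(BaseCallback):
    def __init__(self, check_freq, log_dir, verbose=1):
        super(SaveOnBestTrainingRewardCallbackSketch, self).__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            # Read the episode rewards logged by the Monitor wrapper and
            # compare the mean of the last 100 episodes with the best so far
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-100:])
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    # Save the model whenever performance improves
                    self.model.save(self.save_path)
        return True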
def train_A2C_with_variable_episode(
        start_time_tests=[31 * 24 * 3600, 304 * 24 * 3600],
        episode_length_test=14 * 24 * 3600,
        log_dir=os.path.join(utilities.get_root_path(), 'examples',
                             'agents', 'variable_episode_A2C'),
        tensorboard_log=os.path.join('results')):
    '''Method to train an A2C agent with a variable episode length, using
    a callback to save the model upon performance improvement.

    Parameters
    ----------
    start_time_tests : list of integers
        Time in seconds from the beginning of the year that will be used
        for testing. These periods should be excluded in the training
        process. By default the first day of February and the first day
        of November are used.
    episode_length_test : integer
        Number of seconds indicating the length of the testing periods.
        By default two weeks are reserved for testing.
    log_dir : string
        Directory where the monitoring data and the best trained model
        are stored.
    tensorboard_log : path
        Path to the directory where the tensorboard logs are stored.

    '''
    # Define custom child class:
    class BoptestGymEnvVariableEpisodeLength(BoptestGymEnvRewardWeightCost):
        '''Boptest gym environment with a variable episode length: the
        episode terminates early when the objective integrand exceeds a
        given threshold, in addition to when the maximum episode length
        is exceeded.

        '''

        def compute_done(self, res, reward=None,
                         objective_integrand_threshold=0.1):
            '''Custom method to determine that the episode is done, not
            only when the maximum episode length is exceeded, but also
            when the objective integrand exceeds a certain threshold.
            The latter is useful to early-terminate agent strategies
            that do not work, hence avoiding unnecessary steps and
            improving sampling efficiency.

            Returns
            -------
            done : boolean
                Boolean indicating whether the episode is done or not.

            '''
            done = ((res['time'] >= self.start_time + self.max_episode_length)
                    or
                    (self.objective_integrand >= objective_integrand_threshold))

            return done

    excluding_periods = []
    for start_time_test in start_time_tests:
        excluding_periods.append((start_time_test,
                                  start_time_test + episode_length_test))
    # Summer period (from June 21st till September 22nd).
    # Excluded since no heating during this period (nothing to learn).
    excluding_periods.append((173 * 24 * 3600, 266 * 24 * 3600))

    # Use only six hours as max_episode_length to trigger the callback more often
    env = BoptestGymEnvVariableEpisodeLength(url=url,
                                             actions=['oveHeaPumY_u'],
                                             observations={'reaTZon_y': (280., 310.)},
                                             random_start_time=True,
                                             excluding_periods=excluding_periods,
                                             max_episode_length=6 * 3600,
                                             warmup_period=3 * 3600,
                                             step_period=900)
    env = NormalizedObservationWrapper(env)
    env = NormalizedActionWrapper(env)

    os.makedirs(log_dir, exist_ok=True)

    # Wrap the environment with a monitor that logs training data for the callback
    env = Monitor(env=env, filename=os.path.join(log_dir, 'monitor.csv'))

    # Create the callback: check every 10 steps. We keep it very short for testing
    callback = SaveOnBestTrainingRewardCallback(check_freq=10,
                                                log_dir=log_dir)

    # Initialize the agent
    model = A2C('MlpPolicy', env, verbose=1, gamma=0.99, seed=seed,
                tensorboard_log=tensorboard_log)

    # Train the agent with the callback for saving
    model.learn(total_timesteps=int(100), callback=callback)

    return env, model, start_time_tests
    expert_traj : string
        Path to expert trajectory in .npz format. If not None, the agent
        will be pretrained using behavior cloning with these data.

    '''
    excluding_periods = []
    for start_time_test in start_time_tests:
        excluding_periods.append((start_time_test,
                                  start_time_test + episode_length_test))
    # Summer period (from June 21st till September 22nd).
    # Excluded since no heating during this period (nothing to learn).
    excluding_periods.append((173*24*3600, 266*24*3600))

    # Create a log directory
    log_dir = os.path.join(utilities.get_root_path(), 'examples', 'agents',
                           '{}_{}_{:.0e}_logdir'.format(algorithm, case,
                                                        training_timesteps))
    log_dir = log_dir.replace('+', '')
    os.makedirs(log_dir, exist_ok=True)

    # Redefine reward function
    class BoptestGymEnvCustomReward(BoptestGymEnv):
        '''Define a custom reward for this building.

        '''

        def compute_reward(self):
            '''Custom reward function.

            '''
            # Compute BOPTEST core kpis
                                        random_start_time=True,
                                        excluding_periods=excluding_periods,
                                        max_episode_length=1*24*3600,
                                        warmup_period=1*24*3600,
                                        step_period=900)
    env = NormalizedObservationWrapper(env)
    env = NormalizedActionWrapper(env)
    model = A2C('MlpPolicy', env, verbose=1, gamma=0.99, seed=seed,
                tensorboard_log=tensorboard_log)

    if not load:
        model.learn(total_timesteps=int(1e5))
        # Save the agent
        model.save(os.path.join(utilities.get_root_path(), 'examples',
                                'agents', 'a2c_{}'.format(case)))
    else:
        # Load the trained agent
        model = A2C.load(os.path.join(utilities.get_root_path(), 'examples',
                                      'agents', 'a2c_{}'.format(case)))

    return env, model, start_time_tests


def test_feb(env, model, start_time_tests, episode_length_test,
             warmup_period_test, plot=False):
    '''Perform test in February.

    '''
    observations, actions, rewards, kpis = test_agent(env, model,
    render : boolean
        If True, it renders every episode while training.

    '''
    excluding_periods = []
    for start_time_test in start_time_tests:
        excluding_periods.append((start_time_test,
                                  start_time_test + episode_length_test))
    # Summer period (from June 21st till September 22nd).
    # Excluded since no heating during this period (nothing to learn).
    excluding_periods.append((173 * 24 * 3600, 266 * 24 * 3600))

    # Create a log directory
    log_dir = os.path.join(utilities.get_root_path(), 'examples', 'agents',
                           '{}_{}_{:.0e}_logdir'.format(algorithm, case,
                                                        training_timesteps))
    log_dir = log_dir.replace('+', '')
    os.makedirs(log_dir, exist_ok=True)

    # Redefine reward function
    class BoptestGymEnvCustomReward(BoptestGymEnv):
        '''Define a custom reward for this building.

        '''

        def compute_reward(self):
            '''Custom reward function.

            '''
            # Compute BOPTEST core kpis