# RL CONTROLLER
# Instantiating the control agent(s)
agents = Agent(env, building_info, observations_spaces, actions_spaces)

# Select how many episodes to train for. In the final run we will set this value to 1
# (the buildings run for one year).
episodes = 10

k, c = 0, 0
cost, cum_reward = {}, {}

start = time.time()
# The number of episodes can be replaced by a stopping criterion (i.e. convergence of the average reward)
for e in range(episodes):
    cum_reward[e] = 0
    rewards = []
    state = env.reset()
    done = False
    while not done:
        if k % 1000 == 0:
            print('hour: ' + str(k) + ' of ' + str(8760 * episodes))
        action = agents.select_action(state)
        next_state, reward, done, _ = env.step(action)
        agents.add_to_buffer(state, action, reward, next_state, done)
        state = next_state
        cum_reward[e] += reward[0]
        rewards.append(reward)
        k += 1
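# --- Hedged sketch (not part of the original script): the convergence-based stopping
# criterion mentioned in the comment above. It assumes the cum_reward dict filled by the
# training loop; the window size and tolerance are illustrative choices, not tuned values.
def has_converged(episode_rewards, window=5, tol=1e-2):
    """Return True once the mean cumulative reward of the last two windows differs by less than tol."""
    if len(episode_rewards) < 2 * window:
        return False
    recent = sum(episode_rewards[-window:]) / window
    previous = sum(episode_rewards[-2 * window:-window]) / window
    return abs(recent - previous) < tol

# Usage idea: replace "for e in range(episodes)" with a while-loop guarded by
#   has_converged([cum_reward[i] for i in sorted(cum_reward)])
# so training stops as soon as the average episode reward flattens out.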
# ddpg
# NOTE: the opening of this constructor call (the policy argument and any earlier keyword
# arguments) was truncated at the start of this snippet; "model = DDPG(" is reconstructed
# here based on the stable-baselines DDPG usage implied by model.learn() below.
model = DDPG(
    policy_kwargs=policy_kwargs,
    env=env,
    batch_size=1024,
    buffer_size=int(5e5),
    verbose=0,
    param_noise=param_noise,
    action_noise=action_noise,
    tensorboard_log=parent_dir + "tensorboard/",
    n_cpu_tf_sess=multiprocessing.cpu_count())
model.learn(total_timesteps=interval * icount,
            log_interval=interval,
            tb_log_name="DDPG_{}".format(time.strftime("%Y%m%d")),
            callback=callbackList)

# Roll out the trained policy for one episode to collect rewards and the final costs
obs = env.reset()
dones = False
counter = []
while not dones:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    counter.append(rewards)
env.close()

print("\nFinal costs:")
pp.pprint(env.cost())

# Plot the reward graph
if useBestCallback:
    plot_results([log_dir], interval * icount, results_plotter.X_TIMESTEPS,
                 "DDPG")  # NOTE: the original task-name argument was truncated here; "DDPG" is a placeholder
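# --- Hedged sketch (not part of the original script): one way the param_noise / action_noise
# objects passed to DDPG above could be built. It assumes the stable-baselines (v2) DDPG
# implementation implied by model.learn() and n_cpu_tf_sess; older releases expose the same
# classes under stable_baselines.ddpg.noise instead of stable_baselines.common.noise.
# The standard deviations are illustrative values, not tuned ones.
import numpy as np
from stable_baselines.common.noise import AdaptiveParamNoiseSpec, OrnsteinUhlenbeckActionNoise

n_actions = env.action_space.shape[-1]
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1)
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                            sigma=0.1 * np.ones(n_actions))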
def run(config):
    data_folder = Path(config.data_path)
    building_attributes = data_folder / 'building_attributes.json'
    solar_profile = data_folder / 'solar_generation_1kW.csv'
    building_state_actions = 'buildings_state_action_space.json'
    # building_ids = ["Building_" + str(i) for i in range(1, config.num_buildings + 1)]
    config.num_buildings = 6

    # Customized log directory encoding the main hyper-parameters
    hidden = config.hidden_dim
    lr = config.lr
    tau = config.tau
    gamma = config.gamma
    batch_size = config.batch_size
    buffer_length = config.buffer_length
    to_print = lambda x: str(x)
    log_path = "log" + "_hidden" + to_print(hidden) + "_lr" + to_print(lr) + "_tau" + to_print(tau) + \
        "_gamma" + to_print(gamma) + "_batch_size" + to_print(batch_size) + \
        "_buffer_length" + to_print(buffer_length) + "_TIME_PERIOD_1008_MAXACTION_25" + "/"
    logger = SummaryWriter(log_dir=log_path)

    # TODO fix here
    building_ids = ["Building_" + str(i) for i in [1, 2, 5, 6, 7, 8]]
    env = CityLearn(building_attributes,
                    solar_profile,
                    building_ids,
                    buildings_states_actions=building_state_actions,
                    cost_function=[
                        'ramping', '1-load_factor', 'peak_to_valley_ratio',
                        'peak_demand', 'net_electricity_consumption'
                    ])
    observations_spaces, actions_spaces = env.get_state_action_spaces()

    # Instantiating the control agent(s)
    if config.agent_alg == 'MADDPG':
        agents = MA_DDPG(observations_spaces, actions_spaces, hyper_params=vars(config))
    else:
        raise NotImplementedError

    k, c = 0, 0
    cost, cum_reward = {}, {}
    buffer = ReplayBuffer(max_steps=config.buffer_length,
                          num_agents=config.num_buildings,
                          obs_dims=[s.shape[0] for s in observations_spaces],
                          ac_dims=[a.shape[0] for a in actions_spaces])
    # TODO: store np or tensor in buffer?
    start = time.time()
    for e in range(config.n_episodes):
        cum_reward[e] = 0
        rewards = []
        state = env.reset()
        statecast = lambda x: [torch.FloatTensor(s) for s in x]
        done = False
        ss = 0
        while not done:
            if k % (40000 * 4) == 0:
                print('hour: ' + str(k) + ' of ' + str(TIME_PERIOD * config.n_episodes))
            action = agents.select_action(statecast(state), explore=False)
            action = [a.detach().numpy() for a in action]
            # if batch norm: action = [np.squeeze(a, axis=0) for a in action]
            ss += 1
            next_state, reward, done, _ = env.step(action)
            reward = reward_function(reward)  # See comments in reward_function.py
            # buffer_reward = [-r for r in reward]
            # agents.add_to_buffer()
            buffer.push(statecast(state), action, reward, statecast(next_state), done)
            # if (len(buffer) >= config.batch_size and
            #         (e % config.steps_per_update) < config.n_rollout_threads):
            if len(buffer) >= config.batch_size:
                # Run one MADDPG update per building once the buffer holds a full batch
                if USE_CUDA:
                    agents.to_train(device='gpu')
                else:
                    agents.to_train(device='cpu')
                for a_i in range(agents.n_buildings):
                    sample = buffer.sample(config.batch_size, to_gpu=USE_CUDA)
                    agents.update(sample, a_i, logger=logger,
                                  global_step=e * TIME_PERIOD + ss)

            # Per-step TensorBoard logging
            logger.add_scalar(tag='net electric consumption',
                              scalar_value=env.net_electric_consumption[-1],
                              global_step=e * TIME_PERIOD + ss)
            logger.add_scalar(tag='env cost total',
                              scalar_value=env.cost()['total'],
                              global_step=e * TIME_PERIOD + ss)
            logger.add_scalar(tag="1 load factor",
                              scalar_value=env.cost()['1-load_factor'],
                              global_step=e * TIME_PERIOD + ss)
            logger.add_scalar(tag="peak to valley ratio",
                              scalar_value=env.cost()['peak_to_valley_ratio'],
                              global_step=e * TIME_PERIOD + ss)
            logger.add_scalar(tag="peak demand",
                              scalar_value=env.cost()['peak_demand'],
                              global_step=e * TIME_PERIOD + ss)
            logger.add_scalar(tag="net energy consumption",
                              scalar_value=env.cost()['net_electricity_consumption'],
                              global_step=e * TIME_PERIOD + ss)
            net_energy_consumption_wo_storage = env.net_electric_consumption[-1] \
                + env.electric_generation[-1] \
                - env.electric_consumption_cooling_storage[-1] \
                - env.electric_consumption_dhw_storage[-1]
            logger.add_scalar(tag="net energy consumption without storage",
                              scalar_value=net_energy_consumption_wo_storage,
                              global_step=e * TIME_PERIOD + ss)
            for id, r in enumerate(reward):
                logger.add_scalar(tag="agent {} reward ".format(id),
                                  scalar_value=r,
                                  global_step=e * TIME_PERIOD + ss)
            state = next_state
            cum_reward[e] += reward[0]
            k += 1
            cur_time = time.time()
            # print("average time : {}s/iteration at iteration {}".format((cur_time - start) / (60.0 * k), k))

        # Add the env total cost and reward to the logger at the end of each episode
        cost[e] = env.cost()
        if c % 1 == 0:
            print(cost[e])
        logger.add_scalar(tag='env cost total final',
                          scalar_value=env.cost()['total'],
                          global_step=e)
        logger.add_scalar(tag="1 load factor final",
                          scalar_value=env.cost()['1-load_factor'],
                          global_step=e)
        logger.add_scalar(tag="peak to valley ratio final",
                          scalar_value=env.cost()['peak_to_valley_ratio'],
                          global_step=e)
        logger.add_scalar(tag="peak demand final",
                          scalar_value=env.cost()['peak_demand'],
                          global_step=e)
        logger.add_scalar(tag="net energy consumption final",
                          scalar_value=env.cost()['net_electricity_consumption'],
                          global_step=e)
        net_energy_consumption_wo_storage = env.net_electric_consumption[-1] \
            + env.electric_generation[-1] \
            - env.electric_consumption_cooling_storage[-1] \
            - env.electric_consumption_dhw_storage[-1]
        logger.add_scalar(tag="net energy consumption without storage",
                          scalar_value=net_energy_consumption_wo_storage,
                          global_step=e)
        c += 1
        rewards.append(reward)

    end = time.time()
    print((end - start) / 60.0)
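# --- Hedged sketch (not part of the original script): a possible shape for the
# reward_function referenced inside run(). The real implementation lives in
# reward_function.py; this stand-in only illustrates the common pattern of clipping
# positive raw rewards and amplifying negative ones (raw rewards here are typically the
# negative electricity consumption of each building). The exponent is an illustrative
# assumption, and the name is chosen to avoid shadowing the project's own function.
def example_reward_function(rewards, exponent=3.0):
    """Shape each building's raw reward: ignore positive values, amplify negative ones."""
    return [-(abs(r) ** exponent) if r < 0 else 0.0 for r in rewards]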
# Get the number of agents in the environment
num_agents = env.n_buildings
print('\nNumber of Agents: ', num_agents)

# Size of the state observations
print('\nSize of State: ', observations_spaces)

"""
#############################################
STEP 3: Run the RBC Controller to extract baseline costs
"""
# Instantiating the control agent(s)
agent = RBC_Agent(actions_spaces)

state = env.reset()
done = False
rewards_list = []
while not done:
    action = agent.select_action(state)
    next_state, reward, done, _ = env.step(action)
    state = next_state
    rewards_list.append(reward)

cost_rbc = env.get_baseline_cost()
print(cost_rbc)

"""
###################################
STEP 4: Create DDPG Agents from the Agent Class in ddpg_agent.py

A DDPG agent is initialized with the following parameters.
======
building_info: Dictionary with building information as described above
# Create the final dir
final_dir = parent_dir + "final/"
os.makedirs(final_dir, exist_ok=True)

# Tensorboard writer object
writer = SummaryWriter(log_dir=parent_dir + 'tensorboard/')
print("Logging to {}\n".format(parent_dir + 'tensorboard/'))

# Set seeds (TODO: check that performance is identical for two runs with the same seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)
env.seed(args.seed)

# Get the Rule-Based Controller (RBC) baseline actions, driven only by the hour of day
RBC_agent = RBC_Agent(actions_spacesRBC)

state = RBC_env.reset()
state_list = []
action_list = []
doneRBC = False
while not doneRBC:
    action_RBC = RBC_agent.select_action([
        list(RBC_env.buildings.values())[0].sim_results['hour'][RBC_env.time_step]
    ])
    action_list.append(action_RBC)
    state_list.append(state)
    next_stateRBC, rewardsRBC, doneRBC, _ = RBC_env.step(action_RBC)
    state = next_stateRBC

RBC_action_base = np.array(action_list)
RBC_state_base = np.array(state_list)
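# --- Hedged sketch (not part of the original script): a minimal hour-of-day rule-based
# controller of the kind implied by the loop above, which feeds only the current hour to
# select_action(). The real RBC_Agent is provided by the project; the hour window and the
# charge/discharge rates below are illustrative assumptions, not the project's values.
# (It reuses the numpy import already present in this script.)
class SimpleHourRBC:
    def __init__(self, actions_spaces):
        # One action vector per building, sized from the environment's action spaces
        self.action_dims = [space.shape[0] for space in actions_spaces]

    def select_action(self, states):
        hour = states[0]  # the caller passes [hour_of_day]
        # Discharge the storage devices during daytime hours, charge them at night
        rate = -0.08 if 9 <= hour <= 21 else 0.091
        return [rate * np.ones(dim) for dim in self.action_dims]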