def tensorboard_logger(env):
    for k, v in env.evaluation_result_list:
        pos = k.index("-")
        key = k[:pos - 2]  # NB: xgb usually has validation_0, drop the last 2 chars
        metric = k[pos + 1:]
        tb_log_value("%s/%s" % (key, metric), v, step=env.iteration)

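# A minimal usage sketch for the callback above. It assumes the legacy (pre-1.3)
# xgboost callback interface, where a plain function receives a CallbackEnv
# carrying `evaluation_result_list` and `iteration`, plus the `tensorboard_logger`
# package. The data and parameters below are invented for illustration only.
import numpy as np
import xgboost as xgb
from tensorboard_logger import configure as tb_configure, log_value as tb_log_value

rng = np.random.RandomState(0)
X, y = rng.rand(256, 10), rng.randint(0, 2, 256)
dtrain = xgb.DMatrix(X[:192], label=y[:192])
dvalid = xgb.DMatrix(X[192:], label=y[192:])

tb_configure("runs/xgb_example", flush_secs=5)
xgb.train(
    {"objective": "binary:logistic", "eval_metric": "logloss"},
    dtrain,
    num_boost_round=20,
    evals=[(dvalid, "validation_0")],  # yields keys like "validation_0-logloss"
    callbacks=[tensorboard_logger],    # function-style callback, called once per boosting iteration
)
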
def logger(self, q, finished):
    configure("{}/tb".format(self.args.log_path), flush_secs=30)
    while finished.value < 1:
        try:
            (name, value, step) = q.get(block=False)
            tb_log_value(name, value, step=step)
        except queue.Empty:
            pass
    print("Logging loop closed")

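# A sketch of the producer side this logging loop assumes: a worker pushes
# (name, value, step) tuples onto a multiprocessing.Queue and flips a shared
# `finished` flag when done. The worker body and metric names are hypothetical.
import multiprocessing as mp

def metric_producer(q, finished):
    # Hypothetical training loop that hands metrics off instead of logging directly.
    for step in range(100):
        q.put(("train/loss", 1.0 / (step + 1), step))
    finished.value = 1  # signals the logging loop to exit

if __name__ == "__main__":
    q = mp.Queue()
    finished = mp.Value("i", 0)
    producer = mp.Process(target=metric_producer, args=(q, finished))
    producer.start()
    producer.join()
    # The `logger` method above would run concurrently (e.g. in its own process)
    # and drain the queue until `finished.value` reaches 1. Note that any items
    # still queued when the flag flips may be dropped by the non-blocking loop.
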
def tensorboard_reporting(metrics, tick, phase, tick_type=None):
    """This method will write its results to tensorboard

    :param metrics: A map of metrics to scores
    :param tick: The time (resolution defined by `tick_type`)
    :param phase: The phase of training (`Train`, `Valid`, `Test`)
    :param tick_type: The resolution of tick (`STEP`, `EPOCH`)
    :return:
    """
    # To use this:
    #   tensorboard --logdir runs
    #   http://localhost:6006
    from tensorboard_logger import configure as tb_configure, log_value as tb_log_value
    global g_tb_run
    if g_tb_run is None:
        g_tb_run = 'runs/%d' % os.getpid()
        print('Creating Tensorboard run %s' % g_tb_run)
        tb_configure(g_tb_run, flush_secs=5)

    for metric in metrics.keys():
        chart_id = '%s:%s' % (phase, metric)
        tb_log_value(chart_id, metrics[metric], tick)

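# For illustration, one way the reporting hook above might be called, assuming a
# module-level `g_tb_run = None` and `import os` as in its home module; the scores
# and tick value here are invented.
import os

g_tb_run = None  # lazily replaced with 'runs/<pid>' on the first call

tensorboard_reporting({"avg_loss": 0.42, "acc": 0.87}, tick=3, phase="Train", tick_type="EPOCH")
# Then inspect with:  tensorboard --logdir runs   (http://localhost:6006)
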
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print(
            "Using good policy {} and adv policy {}".format(
                arglist.good_policy, arglist.adv_policy
            )
        )

        np.seterr(all="raise")  # define before your code.

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print("Loading previous state...")
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        print("making logger")
        tb_configure("logs/" + str(arglist.exp_name) + "_" + str(datetime.now()))

        print("Starting iterations...")
        while True:
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = episode_step >= arglist.max_episode_len
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(
                    obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal
                )
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew  # / env.n (?) Do we want this to be the average across all agents?
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n["n"])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + ".pkl"
                    print("Finished benchmarking, now saving...")
                    with open(file_name, "wb") as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # log metrics
            tb_log_value("episode_reward", episode_rewards[train_step - 1], train_step)
            tb_log_value("first_agent_reward", agent_rewards[0][train_step - 1], train_step)
            tb_log_value("second_agent_reward", agent_rewards[1][train_step - 1], train_step)
            if loss is not None:
                loss_to_log = loss
            else:
                loss_to_log = -100
            tb_log_value("loss", loss_to_log, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                print("made it into if terminal and len(episode)")
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                            train_step,
                            len(episode_rewards),
                            np.mean(episode_rewards[-arglist.save_rate:]),
                            round(time.time() - t_start, 3),
                        )
                    )
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                            train_step,
                            len(episode_rewards),
                            np.mean(episode_rewards[-arglist.save_rate:]),
                            [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                            round(time.time() - t_start, 3),
                        )
                    )
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + "_rewards.pkl"
                with open(rew_file_name, "wb") as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + "_agrewards.pkl"
                with open(agrew_file_name, "wb") as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print("...Finished total of {} episodes.".format(len(episode_rewards)))
                break

def learn(self, total_timesteps, callback=None,
          log_interval=4, tb_log_name="SAC", reset_num_timesteps=True,
          replay_wrapper=None, own_log_dir=None, planning_steps=0):

    new_tb_log = self._init_num_timesteps(reset_num_timesteps)
    callback = self._init_callback(callback)

    tb_configure(own_log_dir)
    steps_in_real_env = 0

    if replay_wrapper is not None:
        self.replay_buffer = replay_wrapper(self.replay_buffer)

    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
            as writer:

        self._setup_learn()

        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        # Initial learning rate
        current_lr = self.learning_rate(1)

        start_time = time.time()
        episode_rewards = [0.0]
        episode_successes = []
        if self.action_noise is not None:
            self.action_noise.reset()
        obs = self.env.reset()
        # Retrieve unnormalized observation for saving into the buffer
        if self._vec_normalize_env is not None:
            obs_ = self._vec_normalize_env.get_original_obs().squeeze()

        n_updates = 0
        infos_values = []

        callback.on_training_start(locals(), globals())
        callback.on_rollout_start()

        for step in range(total_timesteps):
            # Before training starts, randomly sample actions
            # from a uniform distribution for better exploration.
            # Afterwards, use the learned policy
            # if random_exploration is set to 0 (normal setting)
            if self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration:
                # actions sampled from action space are from range specific to the environment
                # but algorithm operates on tanh-squashed actions therefore simple scaling is used
                unscaled_action = self.env.action_space.sample()
                action = scale_action(self.action_space, unscaled_action)
            else:
                action = self.policy_tf.step(obs[None], deterministic=False).flatten()
                # Add noise to the action (improve exploration,
                # not needed in general)
                if self.action_noise is not None:
                    action = np.clip(action + self.action_noise(), -1, 1)
                # inferred actions need to be transformed to environment action_space before stepping
                unscaled_action = unscale_action(self.action_space, action)

            assert action.shape == self.env.action_space.shape

            # if not planning:
            #     new_obs, reward, done, info = self.env.step(unscaled_action)
            # else:
            if not self.num_timesteps % (planning_steps + 1):
                new_obs, reward, done, info = self.env.step(unscaled_action)  # , step_num = self.num_timesteps)
                steps_in_real_env += 1
            else:
                print("planning step")
                new_obs, reward, done, info = self.non_vec_env.planning_step(unscaled_action)

            # Only stop training if return value is False, not when it is None. This is for backwards
            # compatibility with callbacks that have no return statement.
            callback.update_locals(locals())
            if callback.on_step() is False:
                break

            # Store only the unnormalized version
            if self._vec_normalize_env is not None:
                new_obs_ = self._vec_normalize_env.get_original_obs().squeeze()
                reward_ = self._vec_normalize_env.get_original_reward().squeeze()
            else:
                # Avoid changing the original ones
                obs_, new_obs_, reward_ = obs, new_obs, reward

            if not self.num_timesteps % (planning_steps + 1):
                print("writing real world step to TB")
                tb_log_value("reward_in_environment", reward_, steps_in_real_env)
            tb_log_value("reward_planning", reward_, self.num_timesteps)

            self.num_timesteps += 1

            # Store transition in the replay buffer.
            self.replay_buffer_add(obs_, action, reward_, new_obs_, done, info)
            obs = new_obs
            # Save the unnormalized observation
            if self._vec_normalize_env is not None:
                obs_ = new_obs_

            # Retrieve reward and episode length if using Monitor wrapper
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                self.ep_info_buf.extend([maybe_ep_info])

            if writer is not None:
                # Write reward per episode to tensorboard
                ep_reward = np.array([reward_]).reshape((1, -1))
                ep_done = np.array([done]).reshape((1, -1))
                tf_util.total_episode_reward_logger(self.episode_reward, ep_reward,
                                                    ep_done, writer, self.num_timesteps)

            if self.num_timesteps % self.train_freq == 0:
                callback.on_rollout_end()

                mb_infos_vals = []
                # Update policy, critics and target networks
                for grad_step in range(self.gradient_steps):
                    # Break if the warmup phase is not over
                    # or if there are not enough samples in the replay buffer
                    if not self.replay_buffer.can_sample(self.batch_size) \
                            or self.num_timesteps < self.learning_starts:
                        break
                    n_updates += 1
                    # Compute current learning_rate
                    frac = 1.0 - step / total_timesteps
                    current_lr = self.learning_rate(frac)
                    # Update policy and critics (q functions)
                    mb_infos_vals.append(self._train_step(step, writer, current_lr))
                    # Update target network
                    if (step + grad_step) % self.target_update_interval == 0:
                        # Update target network
                        self.sess.run(self.target_update_op)
                # Log losses and entropy, useful for monitor training
                if len(mb_infos_vals) > 0:
                    infos_values = np.mean(mb_infos_vals, axis=0)

                callback.on_rollout_start()

            episode_rewards[-1] += reward_
            if done:
                if self.action_noise is not None:
                    self.action_noise.reset()
                if not isinstance(self.env, VecEnv):
                    obs = self.env.reset()
                episode_rewards.append(0.0)

                maybe_is_success = info.get('is_success')
                if maybe_is_success is not None:
                    episode_successes.append(float(maybe_is_success))

            if len(episode_rewards[-101:-1]) == 0:
                mean_reward = -np.inf
            else:
                mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

            # subtract 1 as we appended a new term just now
            num_episodes = len(episode_rewards) - 1
            # Display training infos
            if self.verbose >= 1 and done and log_interval is not None and num_episodes % log_interval == 0:
                fps = int(step / (time.time() - start_time))
                logger.logkv("episodes", num_episodes)
                logger.logkv("mean 100 episode reward", mean_reward)
                if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0:
                    logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf]))
                    logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf]))
                logger.logkv("n_updates", n_updates)
                logger.logkv("current_lr", current_lr)
                logger.logkv("fps", fps)
                logger.logkv('time_elapsed', int(time.time() - start_time))
                if len(episode_successes) > 0:
                    logger.logkv("success rate", np.mean(episode_successes[-100:]))
                if len(infos_values) > 0:
                    for (name, val) in zip(self.infos_names, infos_values):
                        logger.logkv(name, val)
                logger.logkv("total timesteps", self.num_timesteps)
                logger.dumpkvs()
                # Reset infos:
                infos_values = []

        callback.on_training_end()
        return self  # , ep_reward  # , reward_

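# The `planning_steps` argument above interleaves real environment steps with
# model-based planning steps through the `not self.num_timesteps % (planning_steps + 1)`
# test. A standalone sketch of that schedule, with no SAC machinery involved:

def is_real_step(t, planning_steps):
    # One real environment step followed by `planning_steps` planning steps, repeating.
    return t % (planning_steps + 1) == 0

# With planning_steps=3, timesteps 0, 4, ... hit the real environment and the rest
# are served by the model's planning_step.
schedule = ["real" if is_real_step(t, 3) else "plan" for t in range(8)]
assert schedule == ["real", "plan", "plan", "plan", "real", "plan", "plan", "plan"]
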
def learn(self, total_timesteps, callback=None,
          log_interval=4, tb_log_name="SAC", reset_num_timesteps=True,
          replay_wrapper=None, planning_steps=0):

    new_tb_log = self._init_num_timesteps(reset_num_timesteps)
    callback = self._init_callback(callback)

    # TODO: use builtin log writer instead of this old lib
    tb_configure(self.tensorboard_log)

    action_log_csv = self.tensorboard_log + "_actions.csv"
    action_log_df = pd.DataFrame(columns=np.concatenate((
        ["iteration"],
        ["p" + str(i) for i in range(24)],
        ["b" + str(i) for i in range(24)],
        ["e" + str(i) for i in range(24)],
    )))
    action_log_index = 0

    steps_in_real_env = 0
    person_data_dict = {}

    if replay_wrapper is not None:
        self.replay_buffer = replay_wrapper(self.replay_buffer)

    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
            as writer:

        self._setup_learn()

        # Transform to callable if needed
        self.learning_rate = get_schedule_fn(self.learning_rate)
        # Initial learning rate
        current_lr = self.learning_rate(1)

        start_time = time.time()
        episode_rewards = [0.0]
        episode_successes = []
        if self.action_noise is not None:
            self.action_noise.reset()
        obs = self.env.reset()
        # Retrieve unnormalized observation for saving into the buffer
        if self._vec_normalize_env is not None:
            obs_ = self._vec_normalize_env.get_original_obs().squeeze()

        n_updates = 0
        infos_values = []

        callback.on_training_start(locals(), globals())
        callback.on_rollout_start()

        for step in range(total_timesteps):
            # Before training starts, randomly sample actions
            # from a uniform distribution for better exploration.
            # Afterwards, use the learned policy
            # if random_exploration is set to 0 (normal setting)
            if self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration:
                # actions sampled from action space are from range specific to the environment
                # but algorithm operates on tanh-squashed actions therefore simple scaling is used
                unscaled_action = self.env.action_space.sample()
                action = scale_action(self.action_space, unscaled_action)
            else:
                action = self.policy_tf.step(obs[None], deterministic=False).flatten()
                # Add noise to the action (improve exploration,
                # not needed in general)
                if self.action_noise is not None:
                    action = np.clip(action + self.action_noise(), -1, 1)
                # inferred actions need to be transformed to environment action_space before stepping
                unscaled_action = unscale_action(self.action_space, action)

            assert action.shape == self.env.action_space.shape

            # if not planning:
            #     new_obs, reward, done, info = self.env.step(unscaled_action)
            # else:
            if not self.num_timesteps % (planning_steps + 1):
                # TODO: work on this?
                # if self.num_timesteps == 1:
                #     # form the control
                #     from sklearn.preprocessing import MinMaxScaler
                #     grid_price = self.non_vec_env.prices[self.non_vec_env.day - 1]
                #     scaler = MinMaxScaler(feature_range=(0, 10))
                #     scaled_grid_price = scaler.fit_transform(np.array(grid_price).reshape(-1, 1))
                #     scaled_grid_price = np.squeeze(scaled_grid_price)
                #     energy_consumptions = self.non_vec_env._simulate_humans(scaled_grid_price)
                #     person_data_dict["control"] = {
                #         "x": list(range(8, 18)),
                #         "grid_price": scaled_grid_price,
                #         "energy_consumption": energy_consumptions["avg"],
                #         "reward": self.non_vec_env._get_reward(price=grid_price, energy_consumptions=energy_consumptions),
                #     }

                # # form the data_dict
                # if self.num_timesteps in [100, 1000, 9500]:
                #     person_data_dict["Step " + str(self.num_timesteps)] = {
                #         "x": list(range(8, 18)),
                #         "grid_price": self.non_vec_env.prices[self.non_vec_env.day - 1],
                #         "action": unscaled_action,
                #         "energy_consumption": self.non_vec_env.prev_energy,
                #         "reward": reward,
                #     }

                # if self.num_timesteps == 9501 and self.people_reaction_log_dir and self.plotter_person_reaction:
                #     # call the plotting statement
                #     self.plotter_person_reaction(person_data_dict, self.people_reaction_log_dir)

                new_obs, reward, done, info = self.env.step(unscaled_action)  # , step_num = self.num_timesteps)
                steps_in_real_env += 1
            else:
                print("planning step")
                new_obs, reward, done, info = self.non_vec_env.planning_step(unscaled_action)

            # write the action to a csv
            # if ((not self.num_timesteps % 10) & (self.num_timesteps > 10000)) or self.num_timesteps > 19500:
            #     ### get the battery charging
            #     battery_op = {}
            #     total_battery_consumption = np.zeros(24)
            #     total_energy_consumption = np.zeros(24)
            #     for prosumer_name in self.non_vec_env.prosumer_dict:
            #         # Get players response to agent's actions
            #         day = self.non_vec_env.day
            #         price = self.non_vec_env.price
            #         prosumer = self.non_vec_env.prosumer_dict[prosumer_name]
            #         prosumer_battery = prosumer.get_battery_operation(day, price)
            #         prosumer_demand = prosumer.get_response(day, price)
            #         total_battery_consumption += prosumer_battery
            #         total_energy_consumption += prosumer_demand
            #     action_log_df.loc[action_log_index] = np.concatenate((
            #         [self.num_timesteps],
            #         price,
            #         total_battery_consumption,
            #         total_energy_consumption,
            #     ))
            #     action_log_index += 1
            #     action_log_df.to_csv(action_log_csv)
            #     print("Iteration: " + str(self.num_timesteps))

            # Only stop training if return value is False, not when it is None. This is for backwards
            # compatibility with callbacks that have no return statement.
            callback.update_locals(locals())
            if callback.on_step() is False:
                break

            # Store only the unnormalized version
            if self._vec_normalize_env is not None:
                new_obs_ = self._vec_normalize_env.get_original_obs().squeeze()
                reward_ = self._vec_normalize_env.get_original_reward().squeeze()
            else:
                # Avoid changing the original ones
                obs_, new_obs_, reward_ = obs, new_obs, reward

            if not self.num_timesteps % (planning_steps + 1):
                tb_log_value("reward_in_environment", reward_, steps_in_real_env)
            # tb_log_value("reward_planning", reward_, self.num_timesteps)

            self.num_timesteps += 1

            # Store transition in the replay buffer.
            self.replay_buffer_add(obs_, action, reward_, new_obs_, done, info)
            obs = new_obs
            # Save the unnormalized observation
            if self._vec_normalize_env is not None:
                obs_ = new_obs_

            # Retrieve reward and episode length if using Monitor wrapper
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                self.ep_info_buf.extend([maybe_ep_info])

            if writer is not None:
                # Write reward per episode to tensorboard
                ep_reward = np.array([reward_]).reshape((1, -1))
                ep_done = np.array([done]).reshape((1, -1))
                tf_util.total_episode_reward_logger(self.episode_reward, ep_reward,
                                                    ep_done, writer, self.num_timesteps)

            if self.num_timesteps % 100 == 0 and not np.any(unscaled_action == np.inf):
                if self.action_to_prices_fn:
                    prices = self.action_to_prices_fn(unscaled_action)
                    # tf_util.log_histogram(writer, "action_vec_hist", unscaled_action, self.num_timesteps, bins=10, flush=False)
                    # tb_log_value("constant_load_price", np.sum(prices), self.num_timesteps)
                    # tf_util.log_vec_as_histogram(writer, "prices", prices, self.num_timesteps, flush=True)

            if self.num_timesteps % self.train_freq == 0:
                callback.on_rollout_end()

                mb_infos_vals = []
                # Update policy, critics and target networks
                for grad_step in range(self.gradient_steps):
                    # Break if the warmup phase is not over
                    # or if there are not enough samples in the replay buffer
                    if not self.replay_buffer.can_sample(self.batch_size) \
                            or self.num_timesteps < self.learning_starts:
                        break
                    n_updates += 1
                    # Compute current learning_rate
                    frac = 1.0 - step / total_timesteps
                    current_lr = self.learning_rate(frac)
                    # Update policy and critics (q functions)
                    mb_infos_vals.append(self._train_step(step, writer, current_lr))
                    # Update target network
                    if (step + grad_step) % self.target_update_interval == 0:
                        # Update target network
                        self.sess.run(self.target_update_op)
                # Log losses and entropy, useful for monitor training
                if len(mb_infos_vals) > 0:
                    infos_values = np.mean(mb_infos_vals, axis=0)

                callback.on_rollout_start()

            episode_rewards[-1] += reward_
            if done:
                if self.action_noise is not None:
                    self.action_noise.reset()
                if not isinstance(self.env, VecEnv):
                    obs = self.env.reset()
                episode_rewards.append(0.0)

                maybe_is_success = info.get('is_success')
                if maybe_is_success is not None:
                    episode_successes.append(float(maybe_is_success))

            if len(episode_rewards[-101:-1]) == 0:
                mean_reward = -np.inf
            else:
                mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

            # subtract 1 as we appended a new term just now
            num_episodes = len(episode_rewards) - 1
            # Display training infos
            if self.verbose >= 1 and done and log_interval is not None and num_episodes % log_interval == 0:
                fps = int(step / (time.time() - start_time))
                logger.logkv("episodes", num_episodes)
                logger.logkv("mean 100 episode reward", mean_reward)
                if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0:
                    logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf]))
                    logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf]))
                logger.logkv("n_updates", n_updates)
                logger.logkv("current_lr", current_lr)
                logger.logkv("fps", fps)
                logger.logkv('time_elapsed', int(time.time() - start_time))
                if len(episode_successes) > 0:
                    logger.logkv("success rate", np.mean(episode_successes[-100:]))
                if len(infos_values) > 0:
                    for (name, val) in zip(self.infos_names, infos_values):
                        logger.logkv(name, val)
                logger.logkv("total timesteps", self.num_timesteps)
                logger.dumpkvs()
                # Reset infos:
                infos_values = []

        callback.on_training_end()
        return self  # , ep_reward  # , reward_