def __init__(self, env, learning_rate, buffer_size, batch_size, n_epochs,
             gamma, gae_lam, clip_range, ent_coef, vf_coef, max_grad_norm):
    self.env = env
    self.lr = learning_rate
    self.buffer_size = buffer_size
    self.batch_size = batch_size
    self.n_epochs = n_epochs
    self.gamma = gamma
    self.gae_lam = gae_lam
    self.clip_range = clip_range
    self.ent_coef = ent_coef
    self.vf_coef = vf_coef
    self.max_grad_norm = max_grad_norm
    self.num_timesteps = 0
    self.ep_info_buffer = deque(maxlen=100)
    self._n_updates = 0
    # Use the GPU when available
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if isinstance(env, VecEnv):
        self.num_envs = env.num_envs
    self.rms_obs = RunningMeanStd(shape=(1, 1, 84, 84))
    self.rms_rew = RunningMeanStd()
    logger.configure('./logs')

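# The `RunningMeanStd` trackers created above keep running statistics for
# observation/reward normalization. A minimal sketch of the usual
# parallel-moments update follows (in the style of SB3's `RunningMeanStd`);
# it is illustrative, not necessarily the exact class used above.
import numpy as np


class RunningMeanStd:
    """Running mean/variance via the parallel (Chan et al.) moment update."""

    def __init__(self, epsilon: float = 1e-4, shape: tuple = ()):
        self.mean = np.zeros(shape, np.float64)
        self.var = np.ones(shape, np.float64)
        self.count = epsilon  # avoids division by zero before the first update

    def update(self, batch: np.ndarray) -> None:
        batch_mean = batch.mean(axis=0)
        batch_var = batch.var(axis=0)
        batch_count = batch.shape[0]

        delta = batch_mean - self.mean
        total_count = self.count + batch_count

        # Combine the moments of the old statistics and the new batch
        self.mean = self.mean + delta * batch_count / total_count
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m_2 = m_a + m_b + delta ** 2 * self.count * batch_count / total_count
        self.var = m_2 / total_count
        self.count = total_count
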
def test_main(tmp_path): """ tests for the logger module """ logger = configure(None, ["stdout"]) logger.info("hi") logger.debug("shouldn't appear") assert logger.level == INFO logger.set_level(DEBUG) assert logger.level == DEBUG logger.debug("should appear") logger = configure(folder=str(tmp_path)) assert logger.dir == str(tmp_path) logger.record("a", 3) logger.record("b", 2.5) logger.dump() logger.record("b", -2.5) logger.record("a", 5.5) logger.dump() logger.info("^^^ should see a = 5.5") logger.record("f", "this text \n \r should appear in one line") logger.dump() logger.info( '^^^ should see f = "this text \n \r should appear in one line"') logger.record_mean("b", -22.5) logger.record_mean("b", -44.4) logger.record("a", 5.5) logger.dump() logger.record("a", "longasslongasslongasslongasslongasslongassvalue") logger.dump() logger.warn("hey") logger.error("oh")
def test_main(tmp_path): """ tests for the logger module """ info("hi") debug("shouldn't appear") set_level(DEBUG) debug("should appear") configure(folder=str(tmp_path)) record("a", 3) record("b", 2.5) dump() record("b", -2.5) record("a", 5.5) dump() info("^^^ should see a = 5.5") record_mean("b", -22.5) record_mean("b", -44.4) record("a", 5.5) dump() with ScopedConfigure(None, None): info("^^^ should see b = 33.3") with ScopedConfigure(str(tmp_path / "test-logger"), ["json"]): record("b", -2.5) dump() reset() record("a", "longasslongasslongasslongasslongasslongassvalue") dump() warn("hey") error("oh") record_dict({"test": 1})
def train(output_folder, load_path):
    base_output = Path(output_folder)
    full_output = base_output / datetime.datetime.now().isoformat(timespec="seconds")
    # latest = base_output / "latest"
    # latest.symlink_to(full_output)
    logger.configure(folder=str(full_output))

    env = LoveLetterMultiAgentEnv(num_players=4, reward_fn=Rewards.fast_elimination_reward)
    env.seed(SEED)

    # take mujoco hyperparams (but doubled timesteps_per_actorbatch to cover more steps.)
    # model = PPO(MlpPolicy, env, timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
    #             optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=2)
    if load_path:
        model = PPO.load(load_path, env)
    else:
        # def test_fn(env):
        #     return env.valid_action_mask()
        model = PPO(MlpPolicy, env, verbose=1, ent_coef=0.05)  # , action_mask_fn=test_fn

    other_agents = [RandomAgent(env, SEED + i) for i in range(3)]
    # other_agents = [
    #     PPO.load("zoo/ppo_logging/2020-12-27T15:51:49/final_model", env),
    # ]
    #     PPO.load("zoo/ppo_reward_bugfix2/latest/best_model", env),
    #     PPO.load("zoo/ppo_reward_bugfix2/latest/best_model", env),
    # ]
    agents = [model, *other_agents]
    env.set_agents(agents)

    eval_callback = EvalCallback(
        env,
        best_model_save_path=str(full_output),
        log_path=str(full_output),
        eval_freq=EVAL_FREQ,
        n_eval_episodes=EVAL_EPISODES,
    )

    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)
    model.save(str(full_output / "final_model"))
    env.close()

def test_main(tmp_path):
    """
    tests for the logger module
    """
    info("hi")
    debug("shouldn't appear")
    assert get_level() == INFO
    set_level(DEBUG)
    assert get_level() == DEBUG
    debug("should appear")
    configure(folder=str(tmp_path))
    assert get_dir() == str(tmp_path)
    record("a", 3)
    record("b", 2.5)
    dump()
    record("b", -2.5)
    record("a", 5.5)
    dump()
    info("^^^ should see a = 5.5")
    record("f", "this text \n \r should appear in one line")
    dump()
    info('^^^ should see f = "this text \n \r should appear in one line"')
    record_mean("b", -22.5)
    record_mean("b", -44.4)
    record("a", 5.5)
    dump()
    with ScopedConfigure(None, None):
        info("^^^ should see b = 33.3")
    with ScopedConfigure(str(tmp_path / "test-logger"), ["json"]):
        record("b", -2.5)
        dump()
    reset()
    record("a", "longasslongasslongasslongasslongasslongassvalue")
    dump()
    warn("hey")
    error("oh")
    record_dict({"test": 1})
    assert isinstance(get_log_dict(), dict) and set(get_log_dict().keys()) == {"test"}

def configure_logger(
    verbose: int = 0,
    tensorboard_log: Optional[str] = None,
    tb_log_name: str = "",
    reset_num_timesteps: bool = True,
) -> None:
    """
    Configure the logger's outputs.

    :param verbose: the verbosity level: 0 no output, 1 info, 2 debug
    :param tensorboard_log: the log location for tensorboard (if None, no logging)
    :param tb_log_name: tensorboard log
    :param reset_num_timesteps: whether the ``num_timesteps`` attribute is reset
        (when False, training continues in the same run directory)
    """
    if tensorboard_log is not None and SummaryWriter is not None:
        latest_run_id = get_latest_run_id(tensorboard_log, tb_log_name)
        if not reset_num_timesteps:
            # Continue training in the same directory
            latest_run_id -= 1
        save_path = os.path.join(tensorboard_log, f"{tb_log_name}_{latest_run_id + 1}")
        if verbose >= 1:
            logger.configure(save_path, ["stdout", "tensorboard"])
        else:
            logger.configure(save_path, ["tensorboard"])
    elif verbose == 0:
        logger.configure(format_strings=[""])

def test_set_logger(tmp_path):
    # set up logger
    new_logger = configure(str(tmp_path), ["stdout", "csv", "tensorboard"])

    # Default outputs with verbose=0
    model = A2C("MlpPolicy", "CartPole-v1", verbose=0).learn(4)
    assert model.logger.output_formats == []

    model = A2C("MlpPolicy", "CartPole-v1", verbose=0, tensorboard_log=str(tmp_path)).learn(4)
    assert str(tmp_path) in model.logger.dir
    assert isinstance(model.logger.output_formats[0], TensorBoardOutputFormat)

    # Check that the env variable works
    new_tmp_path = str(tmp_path / "new_tmp")
    os.environ["SB3_LOGDIR"] = new_tmp_path
    model = A2C("MlpPolicy", "CartPole-v1", verbose=0).learn(4)
    assert model.logger.dir == new_tmp_path

    # Default outputs with verbose=1
    model = A2C("MlpPolicy", "CartPole-v1", verbose=1).learn(4)
    assert isinstance(model.logger.output_formats[0], HumanOutputFormat)

    # with tensorboard
    model = A2C("MlpPolicy", "CartPole-v1", verbose=1, tensorboard_log=str(tmp_path)).learn(4)
    assert isinstance(model.logger.output_formats[0], HumanOutputFormat)
    assert isinstance(model.logger.output_formats[1], TensorBoardOutputFormat)
    assert len(model.logger.output_formats) == 2
    model.learn(32)

    # set new logger
    model.set_logger(new_logger)
    # Check that the new logger is correctly set up
    assert isinstance(model.logger.output_formats[0], HumanOutputFormat)
    assert isinstance(model.logger.output_formats[1], CSVOutputFormat)
    assert isinstance(model.logger.output_formats[2], TensorBoardOutputFormat)
    assert len(model.logger.output_formats) == 3
    model.learn(32)

    model = A2C("MlpPolicy", "CartPole-v1", verbose=1)
    model.set_logger(new_logger)
    model.learn(32)
    # Check that the new logger is not overwritten
    assert isinstance(model.logger.output_formats[0], HumanOutputFormat)
    assert isinstance(model.logger.output_formats[1], CSVOutputFormat)
    assert isinstance(model.logger.output_formats[2], TensorBoardOutputFormat)
    assert len(model.logger.output_formats) == 3

def test_main(): """ tests for the logger module """ info("hi") debug("shouldn't appear") set_level(DEBUG) debug("should appear") folder = "/tmp/testlogging" if os.path.exists(folder): shutil.rmtree(folder) configure(folder=folder) logkv("a", 3) logkv("b", 2.5) dumpkvs() logkv("b", -2.5) logkv("a", 5.5) dumpkvs() info("^^^ should see a = 5.5") logkv_mean("b", -22.5) logkv_mean("b", -44.4) logkv("a", 5.5) dumpkvs() with ScopedConfigure(None, None): info("^^^ should see b = 33.3") with ScopedConfigure("/tmp/test-logger/", ["json"]): logkv("b", -2.5) dumpkvs() reset() logkv("a", "longasslongasslongasslongasslongasslongassvalue") dumpkvs() warn("hey") error("oh") logkvs({"test": 1})
def configure_logger(
    verbose: int = 0,
    tensorboard_log: Optional[str] = None,
    tb_log_name: str = "",
    reset_num_timesteps: bool = True,
) -> Logger:
    """
    Configure the logger's outputs.

    :param verbose: the verbosity level: 0 no output, 1 info, 2 debug
    :param tensorboard_log: the log location for tensorboard (if None, no logging)
    :param tb_log_name: tensorboard log
    :param reset_num_timesteps: Whether the ``num_timesteps`` attribute is reset or not.
        It allows continuing a previous learning curve (``reset_num_timesteps=False``)
        or starting from t=0 (``reset_num_timesteps=True``, the default).
    :return: The logger object
    """
    save_path, format_strings = None, ["stdout"]

    if tensorboard_log is not None and SummaryWriter is None:
        raise ImportError("Trying to log data to tensorboard but tensorboard is not installed.")

    if tensorboard_log is not None and SummaryWriter is not None:
        latest_run_id = get_latest_run_id(tensorboard_log, tb_log_name)
        if not reset_num_timesteps:
            # Continue training in the same directory
            latest_run_id -= 1
        save_path = os.path.join(tensorboard_log, f"{tb_log_name}_{latest_run_id + 1}")
        if verbose >= 1:
            format_strings = ["stdout", "tensorboard"]
        else:
            format_strings = ["tensorboard"]
    elif verbose == 0:
        format_strings = [""]

    return configure(save_path, format_strings=format_strings)

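# A short usage sketch for `configure_logger` above (the run name and values
# are hypothetical). With reset_num_timesteps=False the latest existing
# "ppo_cartpole_*" run directory under ./tb is reused, and verbose=1 keeps
# stdout output alongside tensorboard.
new_logger = configure_logger(
    verbose=1,
    tensorboard_log="./tb",
    tb_log_name="ppo_cartpole",
    reset_num_timesteps=False,
)
new_logger.record("train/loss", 0.25)
new_logger.dump(step=1)
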
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path='best_model/' + name,
    log_path='best_model/' + name + '/',
    eval_freq=n_timesteps_episode * args.eval_freq,
    deterministic=True,
    render=False,
    n_eval_episodes=args.eval_length)
callbacks.append(eval_callback)

# Set up tensorboard logger
if args.tensorboard:
    log_callback = LoggerCallback(sinergym_logger=bool(args.logger))
    callbacks.append(log_callback)
    # let's change the default dir for TensorboardFormatLogger only
    tb_path = args.tensorboard + '/' + name
    new_logger = configure(tb_path, ["tensorboard"])
    model.set_logger(new_logger)

callback = CallbackList(callbacks)

# ---------------------------------------------------------------------------- #
#                                   TRAINING                                    #
# ---------------------------------------------------------------------------- #
model.learn(
    total_timesteps=timesteps,
    callback=callback,
    log_interval=args.log_interval)
model.save(env.simulator._env_working_dir_parent + '/' + name)

# If the algorithm doesn't reset or close the environment, this script will do it in
# order to correctly log all the simulation data (Energyplus + Sinergym logs)

# Train single CPU PPO1 on slimevolley.
# Should solve it (beat existing AI on average over 1000 trials) in 3 hours on single CPU, within 3M steps.
import os

import click

from stable_baselines3.common import logger
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.ppo import MlpPolicy, PPO

from gym_love_letter.agents import RandomAgent
from gym_love_letter.envs.base import LoveLetterMultiAgentEnv

LOGDIR = "ppo"  # moved to zoo afterwards.
logger.configure(folder=LOGDIR)

SEED = 721
# NUM_TIMESTEPS = int(2e7)
# EVAL_FREQ = 250000
# EVAL_EPISODES = 1000
NUM_TIMESTEPS = 300000
EVAL_FREQ = 5000
EVAL_EPISODES = 50


@click.command()
@click.option("--load", "-l", "load_path")
def train(load_path):
    env = LoveLetterMultiAgentEnv(num_players=4)

def train(env_id, num_timesteps, seed):
    # sess = util.single_threaded_session()
    # sess.__enter__()
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    # Create a new base directory like /home/marco/Reinforcement_Learning/Logs/openai-2018-05-21-12-27
    log_dir = os.path.join(
        energyplus_logbase_dir(),
        datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M"))
    if not os.path.exists(log_dir + '/output'):
        os.makedirs(log_dir + '/output')
    os.environ["ENERGYPLUS_LOG"] = log_dir

    model = os.getenv('ENERGYPLUS_MODEL')
    if model is None:
        print('Environment variable ENERGYPLUS_MODEL is not defined')
        exit()
    weather = os.getenv('ENERGYPLUS_WEATHER')
    if weather is None:
        print('Environment variable ENERGYPLUS_WEATHER is not defined')
        exit()

    # MPI is used to parallelize training.
    # The training is logged to a file log.txt in the given directory.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        print('train: init logger with dir={}'.format(log_dir))
        # XXX
        logger.configure(log_dir)
    else:
        logger.configure(format_strings=[])
        logger.set_level(logger.DISABLED)

    # Make Gym environment:
    env = make_energyplus_env(env_id, workerseed)

    ###### EXPERIMENTS FROM FIRST PAPER: ###########################################
    #
    # trpo_mpi.learn(env, policy_fn,
    #                max_timesteps=num_timesteps,
    #                timesteps_per_batch=16*1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
    #                gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3)
    # Apply TRPO algorithm from OpenAI baselines:
    #
    # action_noise = NormalActionNoise(mean=np.zeros(4), sigma=0.1 * np.ones(4))
    #
    # policy_kwargs_tqc = dict(n_critics=2, n_quantiles=25)
    # model_tqc = TQC("MlpPolicy", env, top_quantiles_to_drop_per_net=2,
    #                 verbose=1, policy_kwargs=policy_kwargs_tqc)
    #
    # model_ppo = PPO('MlpPolicy', env, verbose=1, n_steps=4096, batch_size=64, n_epochs=15)
    # model_td3 = TD3('MlpPolicy', env, verbose=1, action_noise=action_noise)
    # model_sac = SAC('MlpPolicy', env, verbose=1)
    # model_ppolstm = PPO2(MlpLstmPolicy, env, verbose=1, n_steps=27, nminibatches=1)
    #
    # Change the algorithm here:
    # model_ppolstm.learn(total_timesteps=num_timesteps, log_interval=1, reset_num_timesteps=False)
    # model_ppo.learning_rate = 0
    # model_ppo.learn(total_timesteps=35040, reset_num_timesteps=False)

    ##################################### EXPERIMENTS 2: ###################################
    sac_v2_lstm(env, num_timesteps, train=True, test=False)
    # slac(env, num_timesteps)

    env.close()

if __name__ == '__main__':
    # this is required due to forking processes
    run_id = str(uuid.uuid4())  # ALL running environments must share this
    print(f"RUN ID: {run_id}")

    # to pass launch args, add to env_kwargs: 'launch_args': ['render:=false', 'plot_log:=true']
    env = make_vec_env(RocketLeagueInterface, env_kwargs={'run_id': run_id},
                       n_envs=24, vec_env_cls=SubprocVecEnv)
    model = PPO("MlpPolicy", env)

    # log training progress as CSV
    log_dir = expanduser(f'~/catkin_ws/data/rocket_league/{run_id}')
    logger = configure(log_dir, ["stdout", "csv", "log"])
    model.set_logger(logger)

    # log model weights
    freq = 20833  # save 20 times
    # freq = steps / (n_saves * n_envs)
    callback = CheckpointCallback(save_freq=freq, save_path=log_dir)

    # run training
    steps = 240000000  # 240M (10M sequential)
    print(f"training on {steps} steps")
    model.learn(total_timesteps=steps, callback=callback)

    # save final weights
    print("done training")
    model.save(log_dir + "/final_weights")

eval_callback = EvalCallback(
    env_vec,
    best_model_save_path='best_model/' + name + '/',
    log_path='best_model/' + name + '/',
    eval_freq=n_timesteps_episode * args.eval_freq,
    deterministic=True,
    render=False,
    n_eval_episodes=args.eval_length)
callbacks.append(eval_callback)

# Set up tensorboard logger
if args.tensorboard:
    log_callback = LoggerCallback(sinergym_logger=bool(args.logger))
    callbacks.append(log_callback)
    # let's change the default dir for the tensorboard logger (and keep stdout);
    # each format string must be its own list element
    tb_path = args.tensorboard + '/' + name
    new_logger = configure(tb_path, ["tensorboard", "stdout"])
    model.set_logger(new_logger)

callback = CallbackList(callbacks)

# Training
model.learn(
    total_timesteps=timesteps,
    callback=callback,
    log_interval=args.log_interval)
model.save(env.simulator._env_working_dir_parent + '/' + name)

# End mlflow run
mlflow.end_run()

"rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in a2c.ep_info_buffer])) logger.record("time/fps", fps) logger.record("time/time_elapsed", int(time.time() - a2c.start_time), exclude="tensorboard") logger.record("time/total_timesteps", a2c.num_timesteps, exclude="tensorboard") logger.dump(step=a2c.num_timesteps) a2c.train() # obs = cat.reset() # print("obs_shape") # print(obs.shape) # for i in range(10000): # obs,rew,done,info = cat.step([0]*cat.num_envs) # print(obs.shape) # print(rew.shape) configure("log", format_strings='stdout,log,csv'.split(',')) #with ScopedConfigure("log", format_strings='stdout,log,csv'.split(',')): main() #res = ProcConcatVec([env_contr]) # env = aec_to_markov(env) # env.reset() # env.step([0]*len(env.agents))
import random
import time
from copy import deepcopy

import gym
import matplotlib
import numpy as np
import pandas as pd
from gym import spaces
from stable_baselines3.common.logger import configure
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv

matplotlib.use("Agg")

# tf_logger = configure('/home/l/code/python/log/run', format_strings=['tensorboard'])
tf_logger = configure('/home/l/code/python/log/env', format_strings=['log'])
logger = configure('/home/l/code/python/log')


class StockTradingEnvCashpenalty(gym.Env):
    """
    A stock trading environment for OpenAI gym.

    This environment penalizes the model for not maintaining a reserve of cash.
    This enables the model to manage cash reserves in addition to performing
    trading procedures.

    Reward at any step is given as follows:
        r_i = (sum(cash, asset_value) - initial_cash
               - max(0, sum(cash, asset_value) * cash_penalty_proportion - cash)) / days_elapsed

    This reward function takes into account a liquidity requirement, as well as
    long-term accrued rewards.

    Parameters:
        df (pandas.DataFrame): DataFrame containing data
        buy_cost_pct (float): cost for buying shares
        sell_cost_pct (float): cost for selling shares
        hmax (int, array): maximum cash to be traded in each trade per asset.
            If an array is provided, then each index corresponds to each asset
    """

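# Worked example of the docstring's reward (hypothetical numbers). This helper
# merely restates the formula above and is not part of the environment class.
def cash_penalty_reward(cash, asset_value, initial_cash,
                        cash_penalty_proportion, days_elapsed):
    total = cash + asset_value
    penalty = max(0.0, total * cash_penalty_proportion - cash)
    return (total - initial_cash - penalty) / days_elapsed


# With 100,000 cash + 950,000 assets against 1,000,000 initial cash and a 25%
# cash requirement: required cash = 262,500, penalty = 162,500, so
# reward = (1_050_000 - 1_000_000 - 162_500) / 10 = -11_250.0
print(cash_penalty_reward(100_000, 950_000, 1_000_000, 0.25, 10))  # -11250.0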