def basic_usage_example():
    # Basic Usage: Training, Saving, Loading.
    # Create environment.
    env = gym.make("LunarLander-v2")
    # Instantiate the agent.
    model = DQN("MlpPolicy", env, verbose=1)
    # Train the agent.
    model.learn(total_timesteps=int(2e5))
    # Save the agent.
    model.save("dqn_lunar")
    del model  # Delete trained model to demonstrate loading.

    # Load the trained agent.
    # NOTE: if you have loading issues, you can pass 'print_system_info=True'
    # to compare the system on which the model was trained vs the current one.
    # model = DQN.load("dqn_lunar", env=env, print_system_info=True)
    model = DQN.load("dqn_lunar", env=env)

    # Evaluate the agent.
    # NOTE: If you use wrappers with your environment that modify rewards,
    # this will be reflected here. To evaluate with original rewards,
    # wrap the environment in a "Monitor" wrapper before other wrappers.
    mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)

    # Enjoy the trained agent.
    obs = env.reset()
    for i in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = env.step(action)
        env.render()
def train(time_steps, save=False, **params):
    verbose = params.get('verbose', 1)
    buffer_size = params.get('buffer_size', 10000)
    learning_starts = params.get('learning_starts', 1024)
    env = DQNAgent.create_env(1)
    model = DQN('CnnPolicy', env, verbose=verbose, buffer_size=buffer_size,
                learning_starts=learning_starts, tensorboard_log=TB_LOGS)
    model.learn(time_steps)
    if save:
        model.save(MODEL_PATH)
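# The train() helper above only trains and saves. Below is a minimal, hedged sketch for
# loading the saved checkpoint back and scoring it with evaluate_policy; it assumes the
# same MODEL_PATH constant and DQNAgent.create_env helper used by the training code.
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy

def evaluate_saved(n_eval_episodes=10):
    # Rebuild the environment the same way the training code does.
    env = DQNAgent.create_env(1)
    # Load the saved weights; passing env allows further predict()/learn() calls.
    model = DQN.load(MODEL_PATH, env=env)
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=n_eval_episodes)
    print(f"mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")
    return mean_reward, std_reward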
def ai_playing():
    env = Snake_Env(server=False)
    # env = make_vec_env(lambda: env, n_envs=4, monitor_dir="./vec")
    env = Monitor(env, "1e7_bw_dqn")
    obs = env.reset()
    model = DQN("CnnPolicy", env, verbose=1, optimize_memory_usage=True,
                buffer_size=500000)
    model.learn(total_timesteps=1e7)
    model.save("1e7_bw_dqn")
def train_dqn(itr=0, timesteps=1e7, use_dummy_video=True):
    env = flappy_env.FlappyEnv(use_dummy_video)
    env = Monitor(env, f"flappy_dqn_{itr}")
    obs = env.reset()
    model = DQN(
        "CnnPolicy",
        env,
        verbose=1,
        optimize_memory_usage=True,
        buffer_size=500000,
        learning_rate=1e-5,
        tensorboard_log=f"./dqn_flappy_tensorboard_{itr}/")
    model.learn(total_timesteps=timesteps)
    model.save(f"dqn_flappy_{itr}")
def init_and_train_rl_classification_model(timesteps, path='data/rl_rps.pth', save=True, n=2000):
    dm, y_oracle = init_dm(CONFIG)
    env = ClassificationEnv(dm, y_oracle)
    # env = MonitorWrapper(env, autolog=True)
    model = DQN(CnnPolicy, env, verbose=1)
    idxs = list(range(n))
    dm.label_samples(idxs, y_oracle[idxs])
    model.learn(total_timesteps=timesteps)
    if save:
        model.save(path)
    env.enable_evaluating(True)
    evaluate(model, env)
    env.enable_evaluating(False)
    return model
def train_dqn():
    log_dir = "model_save/"
    env = ENV_DISCRETE(istest=False)
    env = Monitor(env, log_dir)
    env = DummyVecEnv([lambda: env])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
    model = DQN('MlpPolicy', env, verbose=1, batch_size=2048, seed=1)
    callback = SaveOnBestTrainingRewardCallback(check_freq=100, log_dir=log_dir)
    model.learn(total_timesteps=int(100000), callback=callback, log_interval=100)
    model.save('model_save/dqn')
def train_dqn_growpsace(save_model=False):
    wandb.run = config.tensorboard.run
    wandb.tensorboard.patch(save=False, tensorboardX=True)
    env = gym.make(config.env_name)
    model = DQN("CnnPolicy", env, verbose=1, gradient_steps=20,
                optimize_memory_usage=True)
    model.learn(total_timesteps=config.num_updates, log_interval=1,
                callback=WandbStableBaselines3Callback())
    if save_model:
        model.save(f"dqn_{config.env_name}")
def train(env, type, timesteps):
    env.reset()
    print(check_env(env))
    env = FlattenObservation(env)
    print(env.reward_range)
    print(env.action_space)
    if type == "DQN":
        model = DQN('MlpPolicy', exploration_fraction=0.999, env=env, verbose=1)
    elif type == "A2C":
        model = A2C('MlpPolicy', env=env, verbose=1)
    elif type == "PPO":
        model = PPO('MlpPolicy', env=env, verbose=1)
    model.learn(total_timesteps=timesteps)
    model.save("model_cups")
def train(params):
    model = DQN(params.get("policy"),
                env,
                verbose=1,
                buffer_size=params.get("buffer_size"),
                learning_rate=params.get("learning_rate"),
                tensorboard_log=log_dir,
                gamma=params.get("gamma"),
                target_update_interval=params.get("target_update_interval"),
                train_freq=params.get("train_freq"),
                gradient_steps=params.get("gradient_steps"),
                exploration_fraction=params.get("exploration_fraction"),
                exploration_final_eps=params.get("exploration_final_eps"),
                learning_starts=params.get("learning_starts"),
                batch_size=params.get("batch_size"),
                policy_kwargs=policy_kwargs)
    # Train for the configured number of steps
    model.learn(total_timesteps=params.get("train_steps"))
    # Save the trained agent
    model.save(exp_name)
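# Note: model.save() stores the policy weights and hyperparameters but not the replay buffer,
# so a stopped-and-resumed run would start with an empty buffer. Below is a minimal sketch of
# persisting and restoring the buffer with SB3's save_replay_buffer / load_replay_buffer,
# reusing env, exp_name and params from the snippet above; the "_replay_buffer" file name is
# an assumption for illustration.
def train_and_checkpoint(params):
    model = DQN(params.get("policy"), env, verbose=1)
    model.learn(total_timesteps=params.get("train_steps"))
    model.save(exp_name)
    # The replay buffer is not included in model.save(); persist it separately.
    model.save_replay_buffer(exp_name + "_replay_buffer")

def resume_training(params):
    model = DQN.load(exp_name, env=env)
    model.load_replay_buffer(exp_name + "_replay_buffer")
    # Continue the timestep counter instead of restarting it from zero.
    model.learn(total_timesteps=params.get("train_steps"), reset_num_timesteps=False)
    model.save(exp_name)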
    save_freq=5000,
    save_path='./checkpoint/v11_dqn_multiInput_4actions_2obs_simpleRF_100000_steps/',
    name_prefix='dqn_policy')

time_steps = 100000
model.learn(
    total_timesteps=int(time_steps),
    log_interval=5,
    tb_log_name="v11_dqn_multiInput_4actions_2obs_simpleRF_100000_steps",
    callback=checkpoint_callback,
)

# Save policy weights
# model.save("model/dqn_airsim_drone_policy")
model.save("model/v11_dqn_multiInput_4actions_2obs_simpleRF_100000_steps")

# time_steps = 100000
# model = DQN(
#     "MultiInputPolicy",
#     env,
#     learning_rate=0.00025,
#     verbose=1,
#     batch_size=32,
#     train_freq=4,
#     target_update_interval=200,
#     learning_starts=200,
#     buffer_size=10000,
#     max_grad_norm=10,
#     exploration_fraction=0.1,
#     exploration_final_eps=0.01,
import sys

from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.dqn.policies import MlpPolicy
from stable_baselines3 import DQN

from gym_sudoku.envs.sudoku_env import SudokuEnv

env = SudokuEnv()

if "--train" in sys.argv:
    model = DQN(MlpPolicy, env, verbose=1, learning_starts=100)
    model.learn(total_timesteps=10000)
    model.save("dqn_sudoku")
else:
    model = DQN.load("dqn_sudoku")

obs = env.reset()
env.render()
for _ in range(20):
    action, _states = model.predict(obs, deterministic=True)
    print("Action", action)
    print("States", _states)
    print("Coordinates", env.fill_pointer)
    obs, rewards, done, info = env.step(action)
    env.render()
    if done:
        print("Resetting ==============================================>")
        obs = env.reset()
    exploration_fraction=0.1,
    exploration_final_eps=0.01,
    device="cuda",
    tensorboard_log="./tb_logs/",
)

# Create an evaluation callback with the same env, called every 10000 iterations
callbacks = []
eval_callback = EvalCallback(
    env,
    callback_on_new_best=None,
    n_eval_episodes=5,
    best_model_save_path=".",
    log_path=".",
    eval_freq=10000,
)
callbacks.append(eval_callback)

kwargs = {}
kwargs["callback"] = callbacks

# Train for a certain number of timesteps
model.learn(
    total_timesteps=5e5,
    tb_log_name="dqn_airsim_drone_run_" + str(time.time()),
    **kwargs
)

# Save policy weights
model.save("dqn_airsim_drone_policy")
    if args.dqn:
        args.name = 'DQN_' + args.name
        model = DQN('MlpPolicy', gym.make('Trading-v2'), verbose=1,
                    device=torch.device('cpu'), tensorboard_log='./runs/')
    else:
        model = PPO('MlpPolicy', make_vec_env('Trading-v2', 8), verbose=1,
                    device=torch.device('cpu'), tensorboard_log='./runs/')
    model.learn(total_timesteps=20e6, tb_log_name=args.name,
                callback=CheckpointCallback(save_freq=10000,
                                            save_path="./trained_models",
                                            name_prefix=args.name))
    model.save('{}_trading_sb'.format('dqn' if args.dqn else 'ppo'))
else:
    print('Loading agent')
    if args.dqn:
        model = DQN.load('dqn_trading_sb')
    else:
        model = PPO.load('ppo_trading_sb')
    # model = PPO('MlpPolicy', env, verbose=1)

eval_eps = 100
pbar = tqdm(total=eval_eps)
env = gym.make('Trading-v0')
rewards = []
baseline_diff = []
for ep in range(eval_eps):
MODEL_PATH = MODELS_DIR_PATH / MODEL_NAME
LOG_PATH = MODELS_DIR_PATH / 'logs/'
CHECKPOINT_PATH = MODELS_DIR_PATH / 'checkpoints/'
MONITOR_PATH = MODELS_DIR_PATH / 'monitoring/'

config = {
    'simulation_frequency': 15,
    'policy_frequency': 0.5,
    'demand_amplitude': 15000,
    'total_steps': 100,
}

env = gym.make('highway-v0', **config)

checkpoint_callback = CheckpointCallback(save_freq=1000,
                                         save_path=CHECKPOINT_PATH,
                                         name_prefix=MODEL_NAME,
                                         verbose=1)
env = Monitor(env, MONITOR_PATH)

model = DQN(MlpPolicy, env, verbose=1, tensorboard_log=LOG_PATH,
            learning_starts=100, target_update_interval=500)
model.learn(total_timesteps=TRAINING_STEPS, callback=checkpoint_callback,
            log_interval=4)
model.save(MODEL_PATH)
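# CheckpointCallback writes periodic snapshots named like f"{MODEL_NAME}_<steps>_steps.zip"
# under CHECKPOINT_PATH. A minimal sketch (reusing the names defined above) for resuming
# training from the most recent checkpoint; treat it as an assumption-laden example rather
# than part of the original script.
from pathlib import Path
from stable_baselines3 import DQN

checkpoints = sorted(Path(CHECKPOINT_PATH).glob(f"{MODEL_NAME}_*_steps.zip"),
                     key=lambda p: p.stat().st_mtime)
if checkpoints:
    model = DQN.load(checkpoints[-1], env=env)
    # Keep counting timesteps from where the checkpoint left off.
    model.learn(total_timesteps=TRAINING_STEPS, callback=checkpoint_callback,
                reset_num_timesteps=False)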
import numpy as np
import gym
import gym_fishing

from stable_baselines3 import DQN
from stable_baselines3.common.env_checker import check_env

env = gym.make('fishing-v0')
check_env(env)

model = DQN('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=200)

## Simulate a run with the trained model, visualize result
df = env.simulate(model)
env.plot(df, "dqn.png")

## Evaluate model
from stable_baselines3.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5)
print("mean reward:", mean_reward, "std:", std_reward)

## Save and reload the model
model.save("dqn")
model = DQN.load("dqn")
checkpoint_callback = CheckpointCallback(
    save_freq=500,
    save_path='./checkpoint/v18_dqn_cnnPolicy_4actions_imageObs_100000_steps/',
    name_prefix='dqn_policy')

time_steps = 100000
model.learn(
    total_timesteps=int(time_steps),
    log_interval=5,
    tb_log_name="v18_dqn_cnnPolicy_4actions_imageObs_100000_steps",
    callback=checkpoint_callback,
)

# Save policy weights
# model.save("model/dqn_airsim_drone_policy")
model.save("model/v18_dqn_cnnPolicy_4actions_imageObs_100000_steps")

# time_steps = 100
# model = DQN(
#     "CnnPolicy",
#     env,
#     learning_rate=0.00025,
#     verbose=1,
#     batch_size=32,
#     train_freq=4,
#     target_update_interval=10000,
#     learning_starts=10000,
#     buffer_size=50000,
#     max_grad_norm=10,
#     exploration_fraction=0.1,
#     exploration_final_eps=0.01,
log_callback = LoggerCallback(sinergym_logger=bool(args.logger))
callbacks.append(log_callback)
# let's change the default dir for TensorboardFormatLogger only
tb_path = args.tensorboard + '/' + name
new_logger = configure(tb_path, ["tensorboard"])
model.set_logger(new_logger)

callback = CallbackList(callbacks)

# ---------------------------------------------------------------------------- #
#                                   TRAINING                                    #
# ---------------------------------------------------------------------------- #
model.learn(total_timesteps=timesteps,
            callback=callback,
            log_interval=args.log_interval)
model.save(env.simulator._env_working_dir_parent + '/' + name)

# If the algorithm doesn't reset or close the environment, this script will do it in
# order to correctly log all the simulation data (Energyplus + Sinergym logs)
if env.simulator._episode_existed:
    env.close()

# ---------------------------------------------------------------------------- #
#                           Mlflow artifacts storage                            #
# ---------------------------------------------------------------------------- #
if args.mlflow_store:
    # Code for sending output and tensorboard to mlflow artifacts.
    mlflow.log_artifacts(local_dir=env.simulator._env_working_dir_parent,
                         artifact_path=name)

if args.evaluation:
    buffer_size=500000,
    max_grad_norm=10,
    exploration_fraction=0.1,
    exploration_final_eps=0.01,
    device="cuda",
    tensorboard_log="./tb_logs/",
)

# Create an evaluation callback with the same env, called every 10000 iterations
callbacks = []
eval_callback = EvalCallback(
    env,
    callback_on_new_best=None,
    n_eval_episodes=5,
    best_model_save_path=".",
    log_path=".",
    eval_freq=10000,
)
callbacks.append(eval_callback)

kwargs = {}
kwargs["callback"] = callbacks

# Train for a certain number of timesteps
model.learn(total_timesteps=5e5,
            tb_log_name="dqn_airsim_car_run_" + str(time.time()),
            **kwargs)

# Save policy weights
model.save("dqn_airsim_car_policy")
print(f"Load agent from {agentPath}") # model = PPO.load(agentPath) model = DQN.load(agentPath) model.set_env(env) else: print(f"Instanciate new agent and save in {agentPath}") # model = PPO("CnnPolicy", env_vec, policy_kwargs=policy_kwargs, verbose=1) # model = DQN("CnnPolicy", env_vec, policy_kwargs=policy_kwargs, verbose=1) model = DQN("CnnPolicy", env, target_update_interval=1000, batch_size=512, exploration_final_eps=0.2, policy_kwargs=policy_kwargs, verbose=1) model.save(agentPath) # Record gif of trained agent imagesGrid = [] obs = env.reset() imagesGrid.append(env.render("human")) for step in range(200): action, _ = model.predict(obs, deterministic=False) obs, reward, done, info = env.step(action) print("reward : ", reward) env.render(mode='console') imagesGrid.append(env.render("human")) if done: print("Goal reached!", "reward=", reward) break imagesGrid[0].save(f'_data/visu.gif',
max_steps = 100
dqn_model.learn(total_timesteps=max_steps)

# Commented out IPython magic to ensure Python compatibility.
# %tensorboard --logdir {LOG_DIR}

"""## Performance evaluation"""

from stable_baselines3.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(dqn_model, test_env, n_eval_episodes=10)
print(f"Eval reward: {mean_reward} (+/-{std_reward})")

"""## Saving the trained model"""

dqn_model.save("dqn_pong")

"""The model will be saved as a zip file, which you can download locally from the left Colab sidebar: open "Files" and use the ellipsis menu next to the filename.

## Loading a trained model

From the left Colab sidebar, under "Files", upload the zip file of the trained model. Here we will upload an A2C model that we trained earlier. You can download it from [here](https://drive.google.com/uc?export=download&id=1COsaNOH8SjbpxxIYc5lOF-QUiUJGU5ZB). If needed, rename the file to a2c_pong.zip
"""

from stable_baselines3 import A2C

# !wget --no-check-certificate https://www.dropbox.com/s/zm02848gzbx3jsl/a2c_pong.zip?dl=1 -O a2c_berzerk.zip
# a2c_model = A2C.load("a2c_berzerk.zip", verbose=10)
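"""Below is a minimal, hedged sketch of actually loading the uploaded checkpoint and scoring it. It assumes the file has been renamed to a2c_pong.zip as described above and reuses the test_env defined earlier; adjust the names if your notebook differs."""

from stable_baselines3 import A2C
from stable_baselines3.common.evaluation import evaluate_policy

# Load the uploaded A2C checkpoint and evaluate it on the same test environment.
a2c_model = A2C.load("a2c_pong.zip")
mean_reward, std_reward = evaluate_policy(a2c_model, test_env, n_eval_episodes=10)
print(f"Eval reward: {mean_reward} (+/-{std_reward})")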
    },
    "policy_frequency": 2,
    "duration": 40,
})
env.reset()

model = DQN('CnnPolicy', env,
            gamma=0.8,
            learning_rate=5e-4,
            buffer_size=40 * 1000,
            learning_starts=200,
            exploration_fraction=0.6,
            target_update_interval=256,
            batch_size=32,
            verbose=1,
            tensorboard_log="logs/")
model.learn(total_timesteps=int(2e5))
model.save("dqn_highway")

# Record video
model = DQN.load("dqn_highway")

env.configure({"policy_frequency": 15, "duration": 20 * 15})
video_length = 2 * env.config["duration"]
env = VecVideoRecorder(env, "videos/",
                       record_video_trigger=lambda x: x == 0,
                       video_length=video_length,
                       name_prefix="dqn-agent")
obs = env.reset()
for _ in range(video_length + 1):
    action, _ = model.predict(obs)
    obs, _, _, _ = env.step(action)
env.close()
import argparse

import gym
from stable_baselines3 import DQN
from pfrl.wrappers import atari_wrappers

ap = argparse.ArgumentParser(description="DQN")
ap.add_argument("--env", default="PongNoFrameskip-v4")
ap.add_argument("--frame_stacks", default=4)
ap.add_argument("--learning_starts", default=100000, type=int)
ap.add_argument("--total_timesteps", default=1000000, type=int)
ap.add_argument("--save_path")
ap.add_argument("--tensorboard_log")
ns = ap.parse_args()

env = atari_wrappers.wrap_deepmind(
    atari_wrappers.make_atari(ns.env, max_frames=10000),
    episode_life=True,
    clip_rewards=True,
)

model = DQN('CnnPolicy', env, verbose=1,
            buffer_size=10000,
            learning_rate=.0001,
            learning_starts=ns.learning_starts,
            target_update_interval=1000,
            tensorboard_log=ns.tensorboard_log)
model.learn(total_timesteps=ns.total_timesteps)

if ns.save_path is not None:
    model.save(ns.save_path)
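# After training, the saved agent can be reloaded and run greedily for a quick sanity check.
# This is a minimal sketch reusing env and ns.save_path from the script above; whether
# env.render() opens a window depends on the Atari/ALE build, so treat that line as optional.
model = DQN.load(ns.save_path, env=env)

obs = env.reset()
for _ in range(1000):
    # Deterministic (greedy) actions for evaluation instead of epsilon-greedy exploration.
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    if done:
        obs = env.reset()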
profiler = Profiler()
profiler.start()

# Setup Environment
print('Environment Setup...')
env = gym.make('Desktop-v0', debug=False, show=True, steplimit=100)
outdir = '/tmp/random-agent-results'
env = Monitor(env, directory=outdir, force=True)
episodes = 10

# Setup Agent
print('Agent Setup...')
model = DQN(MlpPolicy, env, verbose=0, buffer_size=500)

print('Returning Trained Model...')
model.learn(total_timesteps=1000, log_interval=4)

print('Saving Trained Model...')
model.save("deepq_desktop")
del model  # remove to demonstrate saving and loading

print('Loading Trained Model...')
model = DQN.load("deepq_desktop")

def unique_reward(last_state, current_state):
    # rewards a current state that is different from the last state
    return (np.sum(last_state) - np.sum(current_state))

if __name__ == '__main__':
    try:
        print('Running Environment')
        last_state = None
class TradingAgent:
    def __init__(self, model='a2c', use_gp=False, gp_params=None, **kwargs):
        # wrapper around stable_baselines RL implementations
        assert model in ACCEPTED_MODELS, 'Unknown RL model, must be in {}'.format(ACCEPTED_MODELS)
        if model == 'a2c':
            self.rl = A2C(**kwargs)
        elif model == 'ppo':
            self.rl = PPO(**kwargs)
        elif model == 'dqn':
            self.rl = DQN(**kwargs)
        elif model == 'td3':
            self.rl = TD3(**kwargs)

        self.use_gp = use_gp
        if self.use_gp:
            assert gp_params is not None, 'Must provide parameters such as training data, number of iterations, etc. for GPR'
            self.n_train = gp_params['n_train']
            self.retraining_iter = gp_params['training_iter']
            self.cvar_limit = gp_params['cvar_limit']
            self.gp_limit = gp_params['gp_limit']

            self.likelihood = gpytorch.likelihoods.GaussianLikelihood()
            if 'data' in gp_params.keys():
                self.X_train = gp_params['data']['X_train']
                self.y_train = gp_params['data']['y_train']
            else:
                # hard coded to match dimensions of features
                self.X_train = torch.zeros(self.n_train, kwargs['env'].num_features)
                self.y_train = torch.zeros(self.n_train)
            self.gp = ExactGPModel(self.X_train, self.y_train, self.likelihood)
            self.mll = gpytorch.mlls.ExactMarginalLogLikelihood(self.likelihood, self.gp)
            self.opt = torch.optim.Adam(self.gp.parameters(), lr=0.1)

        self.shares = 0
        self.cash = 0
        self.obs = []  # holds up to 2 past observations, helps in keeping X, y aligned

        # for plotting
        self.pred_return = 0
        self.pred_lower = 0
        self.pred_upper = 0

        # for debugging
        self.goal_num_shares = 0

    def learn(self, n_steps):
        # when using gp, load pretrained rl agent - no need to train
        if self.use_gp:
            # train GP using fixed number of steps
            self.__train_gp(100)
        else:
            # train RL agent
            self.rl.learn(n_steps)

    def predict(self, obs, deterministic):
        action, state = self.rl.predict(obs, deterministic=deterministic)
        if self.use_gp:
            # slightly retrain
            self.__train_gp(self.retraining_iter, retrain=True)

            # predict next step returns and CI using GP
            with torch.no_grad(), gpytorch.settings.fast_pred_var():
                output = self.gp(torch.Tensor(obs[2:])[None])
                obs_pred = self.likelihood(output)
                f_mean = output.mean.detach().numpy()[0]
                self.pred_return = f_mean.item()
                f_samples = output.sample(sample_shape=torch.Size((10000,))).detach().numpy()
                lower, upper = obs_pred.confidence_region()
                self.pred_lower = lower.item()
                self.pred_upper = upper.item()

            rl_action = action
            action -= ACTION_OFFSET  # adjust from action for env to see actual trade

            # adjust trade size given prediction
            # if self.shares > 0:  # long position
            if f_mean > self.gp_limit:  # predict positive return over certain threshold
                tail_samples = f_samples[f_samples < lower.item()]
                ps_cvar = np.mean(tail_samples) if len(tail_samples) > 0 else lower.item()  # cvar per share
                if ps_cvar < 0:
                    goal_num_shares = self.cvar_limit // ps_cvar
                else:
                    goal_num_shares = self.shares + action  # positive return for long - no adjustment needed
                action = min(10, max(0, goal_num_shares - self.shares))
            elif f_mean < -self.gp_limit:
                tail_samples = f_samples[f_samples > upper.item()]
                ps_cvar = np.mean(tail_samples) if len(tail_samples) > 0 else upper.item()  # cvar per share
                if ps_cvar < 0:
                    goal_num_shares = self.shares + action  # negative return for short - no adjustment needed
                else:
                    goal_num_shares = self.cvar_limit // ps_cvar
                action = max(-10, min(0, goal_num_shares - self.shares))
            else:
                goal_num_shares = self.shares + action
            # print(ps_cvar, lower.item(), upper.item())

            # if not np.isnan(goal_num_shares):
            self.goal_num_shares = goal_num_shares

            # if action > 0:  # buy order
            #     action = min(10, max(0, goal_num_shares - self.shares))  # restrict same size trades as original, maintain same direction
            #     # print(goal_num_shares - self.shares, action)
            # elif action < 0:  # sell order
            #     action = max(-10, min(0, goal_num_shares - self.shares))  # restrict same size trades as original, maintain same direction

            action += ACTION_OFFSET  # adjust for env actions being 1 to N rather than -N/2 to N/2
            # print(f_mean, ps_cvar, self.shares, goal_num_shares, rl_action-ACTION_OFFSET, action-ACTION_OFFSET)

        return action, state

    def update(self, obs, reward=None):
        self.obs.append(obs)
        self.shares, self.cash = obs[:2]
        if reward is not None:
            self.X_train = torch.cat((self.X_train, torch.Tensor(self.obs.pop(0)[2:])[None]))[1:]  # self.X_train[1:]
            self.y_train = torch.cat((self.y_train, torch.Tensor([reward])))[1:]
            # print(self.X_train, self.y_train)
            self.gp.set_train_data(self.X_train, self.y_train)

    def save(self, rl_path, gp_path=None):
        self.rl.save(rl_path)
        if gp_path is not None:
            torch.save(self.gp.state_dict(), gp_path)

    def load(self, rl_path, gp_path=None):
        self.rl = A2C.load(rl_path)
        if gp_path is not None:
            state_dict = torch.load(gp_path)
            self.gp.load_state_dict(state_dict)

    def __train_gp(self, n_iter, retrain=False):
        # train GP using fixed number of steps
        self.gp.train()
        self.likelihood.train()
        for i in range(n_iter):
            output = self.gp(self.X_train)
            loss = -self.mll(output, self.y_train)
            self.opt.zero_grad()
            loss.backward()
            self.opt.step()
        self.gp.eval()
        self.likelihood.eval()
def key_handler(event):
    """
    Accepts a key event and makes an appropriate decision.
    :param event: Key event
    :return: void
    """
    global _root
    global _routing_canvas
    global _rl_model
    global _is_first_step
    global _rl_env
    global _rl_target_cell
    global _step_count
    global LEARN_RATE
    global EXPLORE_INIT
    global EXPLORE_FINAL
    global GAMMA
    global TRAIN_TIME_STEPS
    global LOAD_MODEL_NAME

    e_char = event.char
    if e_char == 'l':
        # RL Agent Learning
        pass
        # AI Gym Environment check - only do this when testing a new environment (resets RNG seed)
        # check_env(_rl_env)
        _step_count = 0  # Reset because check_env increments via step()

        # RL Agent
        _rl_model = DQN('MlpPolicy', _rl_env, verbose=1, learning_rate=LEARN_RATE,
                        exploration_initial_eps=EXPLORE_INIT,
                        exploration_final_eps=EXPLORE_FINAL, gamma=GAMMA)
        print("Beginning RL training")
        _rl_model.learn(total_timesteps=TRAIN_TIME_STEPS)
        print("Finished RL training")
        print("Saving trained model")
        _rl_model.save("agent_" + time.strftime("%d-%m-%YT%H-%M-%S"))
    elif e_char == 't':
        # RL Agent Testing
        pass
        # AI Gym Environment check - only do this when testing a new environment (resets RNG seed)
        # check_env(_rl_env)
        _step_count = 0  # Reset because check_env increments via step()

        print("Loading trained model")
        if _rl_model is None:
            _rl_model = DQN.load(LOAD_MODEL_NAME)
        obs = _rl_env.reset()
        done = False
        while not done:
            rl_action, states = _rl_model.predict(obs, deterministic=True)
            print("Action " + str(rl_action))
            obs, rewards, done, info = _rl_env.step(rl_action)
    elif e_char == 'r':
        # RL flow debugging (no agent involved, emulate actions randomly)
        if _is_first_step:
            _rl_env.reset()
            _is_first_step = False
        else:
            rand_action = random.randrange(1)
            rl_action_step(rand_action)
    else:
        pass
        target_update_interval=10000,
        exploration_fraction=0.5,
        exploration_initial_eps=1.0,
        exploration_final_eps=0.05,
        max_grad_norm=10,
        tensorboard_log=log_dir,
        create_eval_env=False,
        policy_kwargs=None,
        verbose=2,
        seed=None,
        device='cuda',
        _init_setup_model=True)
        model.learn(total_timesteps=NUM_EPISODES * MAX_STEPS,
                    tb_log_name=log_name, log_interval=1)
        model.save(model_name)

    if ALGORITHM == 'PPO':
        from stable_baselines3.ppo import MlpPolicy
        model = PPO(MlpPolicy, env, tensorboard_log=log_dir, verbose=2)
        model.learn(total_timesteps=NUM_EPISODES * MAX_STEPS,
                    tb_log_name=log_name, log_interval=1)
        model.save(model_name)
    else:
        print('!ERROR: incorrect algorithm selection!')

    del model
else:
    exploration_fraction=0.4)

# env_eval = Monitor(env, './logs/')
eval_callback = EvalCallback(env,
                             best_model_save_path='./logs/',
                             log_path='./logs/',
                             eval_freq=1000,
                             deterministic=True,
                             render=False)

# Deeper NN
# model = DQN.load("DQN", env=env)
model.learn(total_timesteps=5_000_000, callback=eval_callback)  # Typically not enough
model.save("DQN")

# model = DQN.load("DQN", env=env)
model = DQN.load("logs/best_model", env=env)
# model = PPO.load("PPO_discrete", env=env)

logger = Logger(logging_freq_hz=int(env.SIM_FREQ / env.AGGR_PHY_STEPS),
                num_drones=ARGS.num_drones)
obs = env.reset()
start = time.time()
n_trial = 0
for i in range(ARGS.duration_sec * env.SIM_FREQ):
    if ARGS.duration_sec * env.SIM_FREQ % AGGR_PHY_STEPS == 0:
        action, _states = model.predict(obs,
                                        deterministic=True,
                                        )
def learn_with_selfplay(max_agents, num_learn_steps, num_learn_steps_pre_training, num_eval_eps,
                        num_skip_steps=0, model_name='dqn', only_rule_based_op=False, patience=5,
                        image_observations=True, output_folder="output", fine_tune_on=None,
                        opponent_pred_obs=False, adversarial_training=None, save_freq=None):
    """
    Train an agent with regular self-play. If there are checkpoints of previous training, continue training from the checkpoints.

    :param max_agents: Stop after max_agents intermediate agents have been trained. An intermediate agent is saved when training successfully created an improved agent.
    :param num_learn_steps: Number of frames / steps for every learning iteration
    :param num_learn_steps_pre_training: Number of frames / steps for pre-training on the rule-based agent
    :param num_eval_eps: Number of episodes for intermediate evaluation. Intermediate evaluation determines whether the trained agent improved compared to the previous version
    :param num_skip_steps: Skip num_skip_steps frames, performing the action from the previous step
    :param model_name: Name for saving the model. If there are already checkpoints with this name, training is continued. Checkpoints will be saved as model_name{i}, where i is the training iteration.
    :param only_rule_based_op: If set to True, training is only performed against the rule-based agent.
    :param patience: Patience parameter for evaluation
    :param image_observations: Use image instead of feature observations
    :param output_folder: Root folder for outputs
    :param fine_tune_on: If not None, instead of self-play training perform training of an adversarial policy against the victim specified as a string to this parameter
    :param opponent_pred_obs: If this is set to True, the predictions of the opponents in the current state will be concatenated to the observations for the main agent. This was an attempt to create a stronger adversarial policy, which could use this information; however, in our experiments this didn't improve the adversarial policy
    :param adversarial_training: If set to True, perform adversarial training using FGSM during training.
    :param save_freq: If not None, save intermediate checkpoints during training with the given frequency
    :return:
    """
    eval_env, eval_env_rule_based, eval_op, train_env, train_env_rule_based = _init_envs(
        image_observations, num_skip_steps, opponent_pred_obs, adversarial_training)

    # If fine tuning, load model to fine-tune from path
    if fine_tune_on is not None:
        path = Path(output_folder) / 'models' / fine_tune_on
        fine_tune_model = DQN.load(path)
        fine_tune_model.tensorboard_log = None

        if opponent_pred_obs:
            # We can't eval on agents that don't have a q_net so we change eval_op to the original model that is being
            # fine-tuned against, instead of the rule-based agent
            eval_op = fine_tune_model
            eval_env_rule_based.set_opponent(eval_op)
            eval_env_rule_based = OpponentPredictionObs(eval_env_rule_based)
            eval_env.set_opponent(eval_op)
            eval_env = OpponentPredictionObs(eval_env)
    else:
        fine_tune_model = None

    # Initialize first agent
    pre_train_agent = SimpleRuleBasedAgent(train_env_rule_based)
    previous_models = [pre_train_agent]

    # Load potentially saved previous models
    for opponent_id in range(1, max_agents):
        path = _make_model_path(output_folder, model_name, opponent_id)
        if os.path.isfile(path):
            model = DQN.load(path)
            previous_models.append(model)
        else:
            break

    # Initialize first round
    last_agent_id = len(previous_models) - 1
    prev_num_steps = 0
    patience_counter = 0
    tb_path = Path(output_folder) / "tb-log"
    if last_agent_id == 0:
        # main_model = A2C('MlpPolicy', policy_kwargs=dict(optimizer_class=RMSpropTFLike, optimizer_kwargs=dict(eps=1e-5)),
        #                  env=train_env, verbose=0, tensorboard_log="output/tb-log")
        # main_model = A2C('MlpPolicy', train_env, verbose=0, tensorboard_log="output/tb-log")  # , exploration_fraction=0.3)
        main_model = DQN('MlpPolicy', train_env_rule_based, verbose=0, tensorboard_log=tb_path)  # , exploration_fraction=0.3)
    else:
        main_model = copy.deepcopy(previous_models[last_agent_id])
        main_model.set_env(train_env)
        main_model.tensorboard_log = tb_path

    # Start training with self-play over several rounds
    opponent_id = last_agent_id
    while opponent_id < max_agents - 1:
        print(f"Running training round {opponent_id + 1}")
        if fine_tune_on is None:
            # Choose opponent based on setting
            if only_rule_based_op:
                current_train_env = train_env_rule_based
                # Use rule-based as opponent
                current_train_env.set_opponent(SimpleRuleBasedAgent(current_train_env))
            else:
                if opponent_id == 0:
                    current_train_env = train_env_rule_based
                else:
                    current_train_env = train_env
                # Take opponent from the previous version of the model
                current_train_env.set_opponent(previous_models[opponent_id])
        else:
            # Use passed fine-tune agent as opponent
            current_train_env = train_env
            current_train_env.set_opponent(fine_tune_model)

        # Train the model
        current_train_env.set_opponent_right_side(True)
        chosen_n_steps = num_learn_steps_pre_training if opponent_id == 0 else num_learn_steps  # Iteration 0 is pre-training

        # In order to generate adversarial examples the adversarial training wrapper needs a reference to the model that is
        # currently being trained
        if adversarial_training is not None:
            current_train_env.env.victim_model = main_model

        # Optionally add a callback to save intermediate checkpoints
        if save_freq is not None:
            checkpoint_callback = CheckpointCallback(save_freq=save_freq, save_path='./output/intermediate/',
                                                     name_prefix=model_name + str(opponent_id + 1) + '_interm')
        else:
            checkpoint_callback = None

        # === LEARNING ===
        main_model.learn(total_timesteps=chosen_n_steps, tb_log_name=model_name, callback=checkpoint_callback)

        # Do evaluation for this training round
        eval_env_rule_based.set_opponent(eval_op)
        avg_round_reward, num_steps = evaluate(main_model, eval_env_rule_based, num_eps=num_eval_eps)
        print(model_name)
        print(f"Average round reward after training: {avg_round_reward}")
        print(f"Average number of steps per episode: {num_steps / num_eval_eps}")

        # Check if there was improvement
        if num_steps > prev_num_steps:  # Model improved compared to last
            print('Model improved')
            prev_num_steps = num_steps
            # Reset patience counter
            patience_counter = 0
            # Save the further trained model to disk
            main_model.save(_make_model_path(output_folder, model_name, opponent_id + 1))
            # Make a copy of the just saved model by loading it
            copy_of_model = DQN.load(_make_model_path(output_folder, model_name, opponent_id + 1))
            # Save the copy to the list
            previous_models.append(copy_of_model)
            # From here we continue training the same main_model against itself
            opponent_id += 1
        else:
            print('Model did not improve')
            patience_counter += 1
            # Do not save the model
            if patience_counter > patience:
                print('Stopping early due to patience')
                break
            # Because our model did not improve compared to the previous one, we reset our main_model to the previous one
            main_model = DQN.load(_make_model_path(output_folder, model_name, opponent_id))
            main_model.set_env(train_env)
            # Opponent does not change

    if not opponent_pred_obs:
        # Evaluate the last model against each of its previous iterations
        # evaluate_against_predecessors(previous_models, env_rule_based=eval_env_rule_based, env_normal=eval_env, num_eval_eps=num_eval_eps)
        pass  # Not useful right now
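# A hedged example of how learn_with_selfplay might be invoked; the step budgets, model name
# and output folder below are illustrative placeholders, not values taken from the original project.
if __name__ == '__main__':
    learn_with_selfplay(
        max_agents=5,
        num_learn_steps=100_000,
        num_learn_steps_pre_training=200_000,
        num_eval_eps=20,
        model_name='dqn_selfplay',
        image_observations=True,
        output_folder='output',
        save_freq=50_000,
    )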
model = DQN(MlpPolicy, env, verbose=1)
# model = DQN(MlpPolicy, env, seed=1423, target_update_interval=5, batch_size=16, train_freq=128,
#             buffer_size=256, gamma=0.95, learning_rate=1e-3, verbose=1)

print("start model evaluation without learning !")
mean_reward_before, std_reward_before = evaluate_policy(model, env, n_eval_episodes=100)
print("end model evaluation !")

print("start model learning !")
model.learn(total_timesteps=10000, log_interval=4)
print("end model learning !")
print("-> model saved !!")
model.save("dqn_cartpole")

print("start model evaluation with learning !")
mean_reward_after, std_reward_after = evaluate_policy(model, env, n_eval_episodes=100)
print("end model evaluation !")

print("-> model evaluation without learning")
print(
    f"mean_reward:{mean_reward_before:.2f} +/- std_reward:{std_reward_before:.2f}"
)
print("-> model evaluation with learning")
print(
    f"mean_reward:{mean_reward_after:.2f} +/- std_reward:{std_reward_after:.2f}"
eval_log_path = "eval_logs/dqn_evolve_rl_eval_{}_{}_{}_{}".format(env_name, loss_type, seed, time_int)
eval_callback = EvalCallback(eval_env, log_path=eval_log_path, eval_freq=eval_freq,
                             deterministic=True, render=False, n_eval_episodes=5)

if env_name == 'MountainCar-v0':
    buffer_size = 10000  # max(total_timesteps // 100, 500)
    learning_starts = 1000  # max(total_timesteps // 1000, 100)
    learning_rate = 4e-3
    batch_size = 128
    gamma = 0.98
    train_freq = 16
    target_update_interval = 600
    gradient_steps = 8
    exploration_fraction = 0.2  # (learning_starts + 1000)/total_timesteps
    exploration_final_eps = 0.07
    model = DQN(MlpPolicy, env, policy_kwargs=policy_kwargs,
                target_update_interval=target_update_interval,
                exploration_fraction=exploration_fraction,
                buffer_size=buffer_size, train_freq=train_freq,
                learning_starts=learning_starts, seed=seed,
                tensorboard_log=tensorboard_log, verbose=1,
                loss_type=loss_type, dqn_reg_loss_weight=dqn_reg_loss_weight,
                batch_size=batch_size, learning_rate=learning_rate,
                gamma=gamma, gradient_steps=gradient_steps,
                exploration_final_eps=exploration_final_eps)
else:
    model = DQN(MlpPolicy, env, policy_kwargs=policy_kwargs,
                target_update_interval=target_update_interval,
                exploration_fraction=exploration_fraction,
                buffer_size=buffer_size, train_freq=train_freq,
                learning_starts=learning_starts, seed=seed,
                tensorboard_log=tensorboard_log, verbose=1,
                loss_type=loss_type, dqn_reg_loss_weight=dqn_reg_loss_weight)

model.learn(total_timesteps=total_timesteps, log_interval=100, callback=eval_callback)
model.save(model_save_name)