def create_model(self, n_envs=1):
    """ Create env and agent model """
    env_cls = SprEnv
    self.env = make_vec_env(env_cls, n_envs=n_envs,
                            env_kwargs={"params": self.params},
                            seed=self.params.seed)
    self.model = ACKTR(
        self.policy,
        self.env,
        gamma=self.params.agent_config['gamma'],
        n_steps=self.params.agent_config['n_steps'],
        ent_coef=self.params.agent_config['ent_coef'],
        vf_coef=self.params.agent_config['vf_coef'],
        vf_fisher_coef=self.params.agent_config['vf_fisher_coef'],
        max_grad_norm=self.params.agent_config['max_grad_norm'],
        learning_rate=self.params.agent_config['learning_rate'],
        gae_lambda=self.params.agent_config['gae_lambda'],
        lr_schedule=self.params.agent_config['lr_schedule'],
        kfac_clip=self.params.agent_config['kfac_clip'],
        kfac_update=self.params.agent_config['kfac_update'],
        async_eigen_decomp=self.params.agent_config['async_eigen_decomp'],
        verbose=self.params.agent_config['verbose'],
        tensorboard_log="./tb/acktr/",
        seed=self.params.seed,
        policy_kwargs={"params": self.params})
def fed_and_eval(base_index, w):
    base_env = make_vec_env(f"selected-bipedal-{subenv_dict[base_index]}-v0",
                            n_envs=1, seed=seed)
    base_agent = ACKTR.load(f"./base_agent/{subenv_dict[base_index]}/model.zip")
    sub_model_parameters = []
    for subenv in subenv_dict.values():
        client_policy = ACKTR.load(
            f"./base{base_index}_client_model/{subenv}/policy.zip")
        sub_model_parameters.append(client_policy.get_parameters())

    aligned_agent = base_agent
    base_parameter_dict = aligned_agent.get_parameters()
    model_align(w, base_parameter_dict, sub_model_parameters, alpha=alpha)
    aligned_agent.load_parameters(base_parameter_dict)

    avg_reward, reward_std = evaluate_policy(aligned_agent, base_env,
                                             n_eval_episodes=100)
    print(f"base {base_index}, weight {w} done")
    return (avg_reward, reward_std)
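# The model_align helper called above is not shown in the source. A minimal
# sketch, assuming it blends the base parameters toward a weighted average of
# the client parameters, federated-averaging style (the function body and the
# alpha interpolation rule are assumptions):
def model_align(w, base_params, client_params_list, alpha=0.5):
    """Mutate base_params in place:
    theta <- (1 - alpha) * theta_base + alpha * sum_i w_i * theta_client_i."""
    for key in base_params:
        weighted_avg = sum(w_i * client[key]
                           for w_i, client in zip(w, client_params_list))
        base_params[key] = (1 - alpha) * base_params[key] + alpha * weighted_avg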
def load_model(self, path=None):
    """ Load the model from a zip archive """
    if path is not None:
        self.model = ACKTR.load(path)
    else:
        self.model = ACKTR.load(self.params.model_path)
    # Copy the model to the new directory
    self.model.save(self.params.model_path)
def train(env_id, num_timesteps, seed, num_cpu):
    """
    train an ACKTR model on atari

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param num_cpu: (int) The number of cpu to train on
    """
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    model = ACKTR(CnnPolicy, env, nprocs=num_cpu)
    model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
    env.close()
def run():
    torch.multiprocessing.freeze_support()
    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
    model = ACKTR(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=25000)

    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
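# The make_env factory used above is not defined in this snippet. A minimal
# sketch following the standard stable-baselines multiprocessing pattern
# (the per-rank seed offset is an assumption):
import gym

def make_env(env_id, rank, seed=0):
    """Return a thunk that creates and seeds one environment instance."""
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    return _init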
def acktr(env_id, log_dir, timesteps):
    # Create log dir
    os.makedirs(log_dir, exist_ok=True)
    # Create and wrap the environment
    env = gym.make(env_id)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])
    model = ACKTR(MlpPolicy, env, verbose=0)
    # Train the agent
    print("Beginning training episodes with ACKTR.")
    model.learn(total_timesteps=timesteps)
    env.close()
def test_action_mask_run_acktr(vec_env, policy, env_class):
    env = vec_env([env_class])
    model = ACKTR(policy, env, verbose=0)

    obs, done, action_masks = env.reset(), [False], []
    while not done[0]:
        action, _states = model.predict(obs, action_mask=action_masks)
        obs, _, done, infos = env.step(action)

        action_masks.clear()
        for info in infos:
            env_action_mask = info.get('action_mask')
            action_masks.append(env_action_mask)

    env.close()
def get_intrinsic_reward(base_index):
    intrinsic_rewards = [[] for _ in range(len(subenv_dict))]
    # base env
    base_name = subenv_dict[base_index]
    base_env = make_vec_env(f"selected-bipedal-{base_name}-v0",
                            n_envs=1, seed=seed)
    base_agent = ACKTR.load(f"./base_agent/{base_name}/model.zip")
    # rnd model
    rnd_dict = {}
    for client_env in subenv_dict.values():
        rnd = RandomNetworkDistillation(input_size=24)
        rnd.load(f"./base{base_index}_client_model/{client_env}/rnd")
        rnd_dict[client_env] = rnd

    obs = base_env.reset()
    for _ in range(num_test):
        for i, client_env in subenv_dict.items():
            intrinsic_rewards[i].append(
                rnd_dict[client_env].get_intrinsic_reward(obs))
        action = base_agent.predict(obs)
        obs, reward, done, info = base_env.step(action[0])
        if done:
            obs = base_env.reset()
    return intrinsic_rewards
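# RandomNetworkDistillation is a custom class whose definition is not shown.
# A minimal PyTorch sketch of the interface used above (network sizes, the
# on-disk format read by load(), and the MSE-based reward are assumptions):
import numpy as np
import torch
import torch.nn as nn

class RandomNetworkDistillation:
    def __init__(self, input_size, hidden_size=64, out_size=32):
        def mlp():
            return nn.Sequential(nn.Linear(input_size, hidden_size), nn.ReLU(),
                                 nn.Linear(hidden_size, out_size))
        self.target = mlp()      # frozen, randomly initialized network
        self.predictor = mlp()   # trained to imitate the target on visited states
        for param in self.target.parameters():
            param.requires_grad = False

    def get_intrinsic_reward(self, obs):
        # Intrinsic reward = prediction error of the predictor vs. the frozen target
        x = torch.as_tensor(np.asarray(obs), dtype=torch.float32)
        with torch.no_grad():
            return float(((self.predictor(x) - self.target(x)) ** 2).mean())

    def load(self, path):
        state = torch.load(path)  # assumed: dict of state_dicts from torch.save
        self.target.load_state_dict(state["target"])
        self.predictor.load_state_dict(state["predictor"])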
def NewPotential(current_window, algorithm='PPO'):
    # Determine the pretrained agent
    if algorithm == 'A2C':
        model = A2C.load("pretrained_A2C")
    elif algorithm == 'PPO':
        model = PPO2.load("pretrained_PPO")
    elif algorithm == 'ACKTR':
        model = ACKTR.load("pretrained_ACKTR")
    elif algorithm == 'ACER':
        model = ACER.load("pretrained_ACER")
    else:
        raise ValueError("%s is not a valid algorithm." % algorithm)

    if len(current_window) != model.observation_space.shape[0]:
        raise ValueError("Window size %s does not match the model's window size."
                         % len(current_window))

    action, _states = model.predict(current_window, deterministic=False)
    voltages = np.linspace(0, 1, num=model.action_space.n)
    if 0 <= action < model.action_space.n:
        voltage = voltages[action]
    else:
        raise ValueError(
            "Received invalid action={} which is not part of the action space".format(action))
    return voltage
def make_new_model(model_type, policy, env, tensorboard_log=None):
    if model_type.lower() == 'dqn':
        model = DQN(policy, env, tensorboard_log=tensorboard_log)
    elif model_type.lower() == 'ppo2':
        model = PPO2(policy, env, tensorboard_log=tensorboard_log)
    elif model_type.lower() == 'a2c':
        model = A2C(policy, env, tensorboard_log=tensorboard_log)
    elif model_type.lower() == 'acktr':
        model = ACKTR(policy, env, tensorboard_log=tensorboard_log)
    else:
        # Avoid returning an unbound name for unknown types
        raise ValueError("Unknown model type: %s" % model_type)
    return model
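# Example usage of make_new_model, assuming a standard Gym env and a
# stable-baselines policy string (the env id and log path are illustrative):
import gym
from stable_baselines.common.vec_env import DummyVecEnv

env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
model = make_new_model('acktr', 'MlpPolicy', env, tensorboard_log="./tb/")
model.learn(total_timesteps=10000)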
def run_illegal_move_training(exp_name, exp_path,
                              basicdate,
                              model_type='PPO2',
                              n_eval_episodes=10,
                              training_intervals=100,
                              max_steps=10000,
                              reward_margin=10,
                              log_to_tb=False,
                              pelican_agent_filepath=False):
    # set up logging
    if log_to_tb:
        writer = SummaryWriter(exp_path)
        tb_log_name = 'Illegal_move_prevention_training'
    else:
        writer = None
        tb_log_name = None

    if pelican_agent_filepath:
        logger.info('Loading agent from file: ' + pelican_agent_filepath)
        # env = plark_env_illegal_move.PlarkEnvIllegalMove(
        #     config_file_path='/Components/plark-game/plark_game/game_config/10x10/balanced.json')
        env = gym.make('plark-env-illegal-move-v0')

        if model_type.lower() == 'dqn':
            model = DQN.load(pelican_agent_filepath)
            model.set_env(env)
        elif model_type.lower() == 'ppo2':
            model = PPO2.load(pelican_agent_filepath)
            model.set_env(DummyVecEnv([lambda: env]))
        elif model_type.lower() == 'a2c':
            model = A2C.load(pelican_agent_filepath)
            model.set_env(env)
        elif model_type.lower() == 'acktr':
            model = ACKTR.load(pelican_agent_filepath)
            model.set_env(env)
    else:
        # Instantiate the env and model
        env = gym.make('plark-env-illegal-move-v0')
        model = PPO2('CnnPolicy', env)

    # Start training
    train_agent(exp_path, model, env, training_intervals, max_steps, model_type,
                basicdate, writer, tb_log_name, reward_margin)

    # Evaluate
    mean_reward, n_steps = evaluate_policy(model, env,
                                           n_eval_episodes=n_eval_episodes,
                                           deterministic=False, render=False,
                                           callback=None, reward_threshold=None,
                                           return_episode_rewards=False)
    logger.info('Evaluation finished')
    logger.info('Mean Reward is ' + str(mean_reward))
    logger.info('Number of steps is ' + str(n_steps))
def run_sonobuoy_training(exp_name, exp_path,
                          basicdate,
                          model_type='PPO2',
                          n_eval_episodes=10,
                          training_intervals=100,
                          max_steps=10000,
                          reward_margin=10,
                          log_to_tb=False,
                          pelican_agent_filepath=False):
    # set up logging
    if log_to_tb:
        writer = SummaryWriter(exp_path)
        tb_log_name = 'sonobuoy_training'
    else:
        writer = None
        tb_log_name = None

    env = gym.make('plark-env-v0',
                   panther_agent_filepath='/data/agents/models/PPO2_20200429_073132_panther/')

    if pelican_agent_filepath:
        logger.info('Loading agent from file: ' + pelican_agent_filepath)

        if model_type.lower() == 'dqn':
            model = DQN.load(pelican_agent_filepath)
            model.set_env(env)
        elif model_type.lower() == 'ppo2':
            model = PPO2.load(pelican_agent_filepath)
            model.set_env(DummyVecEnv([lambda: env]))
        elif model_type.lower() == 'a2c':
            model = A2C.load(pelican_agent_filepath)
            model.set_env(env)
        elif model_type.lower() == 'acktr':
            model = ACKTR.load(pelican_agent_filepath)
            model.set_env(env)
    else:
        # Instantiate the env and model
        model = PPO2('CnnPolicy', env)

    # Start training
    train_agent(exp_path, model, env, training_intervals, max_steps, model_type,
                basicdate, writer, tb_log_name, reward_margin)

    # Evaluate
    mean_reward, n_steps = evaluate_policy(model, env,
                                           n_eval_episodes=n_eval_episodes,
                                           deterministic=False, render=False,
                                           callback=None, reward_threshold=None,
                                           return_episode_rewards=False)
    logger.info('Evaluation finished')
    logger.info('Mean Reward is ' + str(mean_reward))
    logger.info('Number of steps is ' + str(n_steps))
def loadAgent(self, filepath, algorithm_type):
    try:
        if algorithm_type.lower() == 'dqn':
            self.model = DQN.load(filepath)
        elif algorithm_type.lower() == 'ppo2':
            self.model = PPO2.load(filepath)
        elif algorithm_type.lower() == 'a2c':
            self.model = A2C.load(filepath)
        elif algorithm_type.lower() == 'acktr':
            self.model = ACKTR.load(filepath)
    except Exception:
        raise ValueError('Error loading pelican agent. File : "' +
                         filepath + '" does not exist')
def optimize_agent(trial): """ Train the model and optimise Optuna maximises the negative log likelihood, so we need to negate the reward here """ model_params = optimize_acktr(trial) seed = trial.suggest_int('numpyseed', 1, 429496729) np.random.seed(seed) original_env = gym.make('rustyblocks-v0') original_env.max_invalid_tries = 3 env = DummyVecEnv([lambda: original_env]) model = ACKTR("MlpPolicy", env, nprocs=1, verbose=0, **model_params) print("DOING LEARING acer") original_env.force_progression = False model.learn(int(2e4), seed=seed) print("DONE LEARING acer") original_env.max_invalid_tries = -1 rewards = [] n_episodes, reward_sum = 0, 0.0 obs = env.reset() original_env.force_progression = True original_env.invalid_try_limit = 5000 while n_episodes < 4: action, _ = model.predict(obs) obs, reward, done, _ = env.step(action) reward_sum += reward if done: rewards.append(reward_sum) reward_sum = 0.0 n_episodes += 1 obs = env.reset() last_reward = np.mean(rewards) trial.report(last_reward) return last_reward
def train_acktr(seed):
    """ test ACKTR on the uav_env (cartesian, discrete) """
    """
    ACKTR(policy, env, gamma=0.99, nprocs=1, n_steps=20, ent_coef=0.01,
          vf_coef=0.25, vf_fisher_coef=1.0, learning_rate=0.25,
          max_grad_norm=0.5, kfac_clip=0.001, lr_schedule='linear', verbose=0,
          tensorboard_log=None, _init_setup_model=True, async_eigen_decomp=False)
    """
    algo = 'ACKTR'
    num_timesteps = 3000000

    env = set_up_env(seed)

    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0

    model = ACKTR(policy=MlpPolicy, env=env, gamma=0.99, nprocs=1, n_steps=20,
                  ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0,
                  learning_rate=0.25, max_grad_norm=0.5, kfac_clip=0.001,
                  lr_schedule='linear', verbose=0,
                  tensorboard_log="./logs/{}/tensorboard/{}/".format(EXPERIMENT_NATURE, algo),
                  _init_setup_model=True)  # , async_eigen_decomp=False)

    model.learn(total_timesteps=num_timesteps, callback=callback, seed=seed,
                log_interval=500, tb_log_name="seed_{}".format(seed))

    model = ACKTR.load(log_dir + 'best_model.pkl')

    evaluation = evaluate_model(env, model, 100)

    os.makedirs('./logs/{}/csv/{}/'.format(EXPERIMENT_NATURE, algo), exist_ok=True)
    os.rename('/tmp/gym/monitor.csv',
              "./logs/{}/csv/{}/seed_{}.csv".format(EXPERIMENT_NATURE, algo, seed))

    env.close()
    del model, env
    gc.collect()
    return evaluation
def loadAgent(self, filepath, algorithm_type):
    try:
        if algorithm_type.lower() == "dqn":
            self.model = DQN.load(filepath)
        elif algorithm_type.lower() == "ppo2":
            self.model = PPO2.load(filepath)
        elif algorithm_type.lower() == "ppo":
            self.model = PPO.load(filepath)
        elif algorithm_type.lower() == "a2c":
            self.model = A2C.load(filepath)
        elif algorithm_type.lower() == "acktr":
            self.model = ACKTR.load(filepath)
    except Exception:
        raise ValueError('Error loading panther agent. File : "' +
                         filepath + '" does not exist')
def stable_baseline_test(env_origin):
    env = make_vec_env(lambda: env_origin, n_envs=1)
    model = ACKTR('CnnPolicy', env_origin, verbose=1)
    model.learn(total_timesteps=2000000)

    print("Stable_baseline evaluation starts.....\n")
    # NOTE: evaluate_policy needs a vec_env
    reward_mean, reward_std = evaluate_policy(model, env, n_eval_episodes=20,
                                              deterministic=False)
    print("mean reward:" + str(reward_mean) + '\n')
    print("reward std:" + str(reward_std) + '\n')

    print("custom evaluation begins\n")
    env = env_origin
    obs = env.reset()
    reward_list_total = []
    epilen_list = []
    reward_list = []
    last_end = 0
    for i in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        reward_list.append(rewards)
        if dones:
            obs = env.reset()
            epilen_list.append(i - last_end)
            last_end = i
            reward_list_total.append(np.sum(reward_list))
            reward_list = []
            if i > 900:
                break
    print("mean reward:{}\n".format(np.mean(reward_list_total)))
    print("mean epilen:{}\n".format(np.mean(epilen_list)))
def save_client(base_index, subenv_id):
    base_agent = ACKTR.load(f"./base_agent/{subenv_dict[base_index]}/model.zip")
    subenv = subenv_dict[subenv_id]
    env = make_vec_env(f"selected-bipedal-{subenv}-v0", n_envs=n_envs, seed=seed)

    learner = base_agent
    # use set_env rather than assigning learner.env directly
    learner.set_env(env)
    learner.verbose = 0
    callback = SaveRNDDatasetCallback(base_index=base_index)
    learner.learn(
        total_timesteps=client_timesteps,
        callback=callback,
    )

    dir_name = f"base{base_index}_client_model/{subenv}"
    Path(dir_name).mkdir(parents=True, exist_ok=True)
    learner.save(f"{dir_name}/policy.zip")
    print(f"base {base_index} sub-env {subenv} done")
def eval_base_agent(agent_index):
    mean_result = []
    std_result = []
    agent = ACKTR.load(f"./base_agent/{subenv_dict[agent_index]}/model.zip")
    for env_index in range(4):
        env = gym.make(f"selected-bipedal-{subenv_dict[env_index]}-v0")
        env.seed(seed)  # call the seeding method rather than overwriting it
        mean, std = evaluate_policy(agent, env, n_eval_episodes=100)
        mean_result.append(mean)
        std_result.append(std)

    Path("log").mkdir(parents=True, exist_ok=True)
    file = open(f"log/agent{agent_index}_simple_agent_test.csv", "w", newline="")
    writer = csv.writer(file)
    writer.writerow(mean_result)
    writer.writerow(std_result)
    file.close()

    print(f">>> Agent {agent_index}:")
    print(mean_result)
    print(std_result)
    return
def optimize_agent(trial):
    agent = PPO2
    policy = MlpLstmPolicy
    train_env, test_env = optimize_envs(trial)

    if agent == ACKTR:
        params = optimize_acktr(trial)
        model = ACKTR(policy, train_env, verbose=1,
                      tensorboard_log="./tensorboard", **params)
    elif agent == PPO2:
        params = optimize_ppo2(trial)
        model = PPO2(policy, train_env, verbose=1, nminibatches=1,
                     tensorboard_log="./tensorboard", **params)

    model.test_env = test_env
    model.trial = trial

    try:
        model.learn(n_timesteps, callback=learn_callback)
        model.env.close()
        test_env.close()
    except AssertionError:
        # Sometimes, random hyperparams can generate NaN
        model.env.close()
        model.test_env.close()
        raise

    is_pruned = False
    cost = np.inf

    if hasattr(model, 'is_pruned'):
        is_pruned = model.is_pruned  # pylint: disable=no-member
        cost = -1 * model.last_mean_test_reward  # pylint: disable=no-member

    del model.env, model.test_env
    del model

    if is_pruned:
        raise optuna.structs.TrialPruned()

    return cost
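# optimize_ppo2 is likewise not shown; a sketch in the same style as the
# optimize_acktr sketch above (the ranges are assumptions, and nminibatches
# is omitted because it is passed explicitly in the constructor call):
def optimize_ppo2(trial):
    return {
        'n_steps': trial.suggest_int('n_steps', 16, 2048),
        'gamma': trial.suggest_uniform('gamma', 0.9, 0.9999),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-2),
        'ent_coef': trial.suggest_loguniform('ent_coef', 1e-8, 1e-1),
        'cliprange': trial.suggest_uniform('cliprange', 0.1, 0.4),
        'lam': trial.suggest_uniform('lam', 0.8, 1.0),
    }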
import numpy as np
import pytest

from stable_baselines import A2C, ACER, ACKTR, DQN, DDPG, SAC, PPO1, PPO2, TD3, TRPO
from stable_baselines.ddpg import NormalActionNoise
from stable_baselines.common.identity_env import IdentityEnv, IdentityEnvBox
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.common.evaluation import evaluate_policy

# Hyperparameters for learning identity for each RL model
LEARN_FUNC_DICT = {
    'a2c': lambda e: A2C(policy="MlpPolicy", learning_rate=1e-3, n_steps=1,
                         gamma=0.7, env=e, seed=0).learn(total_timesteps=10000),
    'acer': lambda e: ACER(policy="MlpPolicy", env=e, seed=0,
                           n_steps=1, replay_ratio=1).learn(total_timesteps=15000),
    'acktr': lambda e: ACKTR(policy="MlpPolicy", env=e, seed=0,
                             learning_rate=5e-4, n_steps=1).learn(total_timesteps=20000),
    'dqn': lambda e: DQN(policy="MlpPolicy", batch_size=16, gamma=0.1,
                         exploration_fraction=0.001, env=e, seed=0).learn(total_timesteps=40000),
    'ppo1': lambda e: PPO1(policy="MlpPolicy", env=e, seed=0, lam=0.5,
                           optim_batchsize=16, optim_stepsize=1e-3).learn(total_timesteps=15000),
    'ppo2': lambda e: PPO2(policy="MlpPolicy", env=e, seed=0,
                           learning_rate=1.5e-3, lam=0.8).learn(total_timesteps=20000),
    'trpo': lambda e: TRPO(policy="MlpPolicy", env=e, seed=0,
                           max_kl=0.05, lam=0.7).learn(total_timesteps=10000),
}


@pytest.mark.slow
@pytest.mark.parametrize("model_name", ['a2c', 'acer', 'acktr', 'dqn', 'ppo1', 'ppo2', 'trpo'])
def test_identity(model_name):
    """
import gym

from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines import ACKTR

# multiprocess environment
n_cpu = 4
env = SubprocVecEnv([lambda: gym.make('CartPole-v1') for i in range(n_cpu)])

model = ACKTR(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
model.save("acktr_cartpole")

del model  # remove to demonstrate saving and loading

model = ACKTR.load("acktr_cartpole")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
import pytest

from stable_baselines import A2C, ACER, ACKTR, DeepQ, DDPG, PPO1, PPO2, TRPO
from stable_baselines.ddpg import AdaptiveParamNoiseSpec
from stable_baselines.common.identity_env import IdentityEnv, IdentityEnvBox
from stable_baselines.common.vec_env import DummyVecEnv

PARAM_NOISE_DDPG = AdaptiveParamNoiseSpec(initial_stddev=float(0.2),
                                          desired_action_stddev=float(0.2))

# Hyperparameters for learning identity for each RL model
LEARN_FUNC_DICT = {
    'a2c': lambda e: A2C(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'acer': lambda e: ACER(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'acktr': lambda e: ACKTR(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'deepq': lambda e: DeepQ(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'ddpg': lambda e: DDPG(policy="MlpPolicy", env=e,
                           param_noise=PARAM_NOISE_DDPG).learn(total_timesteps=1000),
    'ppo1': lambda e: PPO1(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'ppo2': lambda e: PPO2(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'trpo': lambda e: TRPO(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
}


@pytest.mark.slow
#! /usr/bin/env python
import gym
gym.logger.set_level(40)

import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

from env import GoLeftEnv

from stable_baselines import DQN, PPO2, A2C, ACKTR
from stable_baselines.common.cmd_util import make_vec_env
from stable_baselines.common.evaluation import evaluate_policy

env = GoLeftEnv(grid_size=10)
env = make_vec_env(lambda: env, n_envs=1)

model = ACKTR.load("models/acktr_goleft", env=env)

obs = env.reset()
n_steps = 20
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic=True)
    print("Step {}".format(step + 1))
    print("Action: ", action)
    obs, reward, done, info = env.step(action)
    print('obs=', obs, 'reward=', reward, 'done=', done)
    if done:
        # Note that the VecEnv resets automatically
        # when a done signal is encountered
        print("Goal reached!", "reward=", reward)
        break
def train_initial_policy(model_name, algo=ALGO, env_name=ENV_NAME, time_steps=TIME_STEPS):
    """Uses the specified algorithm on the target environment"""
    print("Using algorithm : ", algo.__name__)
    print("Model saved as : ",
          "data/models/" + algo.__name__ + "_initial_policy_" + env_name + "_.pkl")

    # define the environment here
    env = gym.make(env_name)
    env.seed(SEED)
    if NOISE_VALUE > 0:
        env = NoisyRealEnv(env, noise_value=NOISE_VALUE)

    if MUJOCO_NORMALIZE:
        env = MujocoNormalized(env)

    print('~~ ENV Obs RANGE : ', env.observation_space.low, env.observation_space.high)
    print('~~~ ENV Action RANGE : ', env.action_space.low, env.action_space.high)

    if algo.__name__ == "ACKTR":
        print('Using SubprocVecEnv')
        env = SubprocVecEnv([lambda: env for i in range(8)])
    elif algo.__name__ == "SAC":
        print('Using standard gym environment')
        env = env
    else:
        print('Using Dummy Vec Env')
        env = DummyVecEnv([lambda: env])

    if NORMALIZE:
        env = VecNormalize(env,
                           training=True,
                           norm_obs=True,
                           norm_reward=False,
                           clip_reward=1e6,
                           )

    with open('data/target_policy_params.yaml') as file:
        args = yaml.load(file, Loader=yaml.FullLoader)
    args = args[algo.__name__][PARAMS_ENV]
    print('~~ Loaded args file ~~')

    if algo.__name__ == "SAC":
        print('Initializing SAC with RLBaselinesZoo hyperparameters .. ')
        print('using 256 node architecture as in the paper')

        class CustomPolicy(ffp_sac):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy, self).__init__(*args, **kwargs,
                                                   feature_extraction="mlp",
                                                   layers=[256, 256])

        model = SAC(CustomPolicy, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    ent_coef=args['ent_coef'],
                    learning_starts=args['learning_starts'],
                    learning_rate=args['learning_rate'],
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )
    elif algo.__name__ == "TD3":
        print('Initializing TD3 with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/td3/HopperBulletEnv-v0/config.yml
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=float(args['noise_std']) * np.ones(n_actions))

        class CustomPolicy2(ffp_td3):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy2, self).__init__(*args, **kwargs,
                                                    feature_extraction="mlp",
                                                    layers=[400, 300])

        model = TD3(CustomPolicy2, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    gamma=args['gamma'],
                    gradient_steps=args['gradient_steps'],
                    learning_rate=args['learning_rate'],
                    learning_starts=args['learning_starts'],
                    action_noise=action_noise,
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )
    elif algo.__name__ == "TRPO":
        print('Initializing TRPO with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/sac/HopperBulletEnv-v0/config.yml
        model = TRPO(mlp_standard, env,
                     verbose=1,
                     tensorboard_log='data/TBlogs/initial_policy_training',
                     timesteps_per_batch=args['timesteps_per_batch'],
                     lam=args['lam'],
                     max_kl=args['max_kl'],
                     gamma=args['gamma'],
                     vf_iters=args['vf_iters'],
                     vf_stepsize=args['vf_stepsize'],
                     entcoeff=args['entcoeff'],
                     cg_damping=args['cg_damping'],
                     cg_iters=args['cg_iters'],
                     seed=SEED,
                     )
    elif algo.__name__ == "ACKTR":
        print('Initializing ACKTR')
        model = ACKTR(mlp_standard,
                      env,
                      verbose=1,
                      n_steps=128,
                      ent_coef=0.01,
                      lr_schedule='constant',
                      learning_rate=0.0217,
                      max_grad_norm=0.5,
                      gamma=0.99,
                      vf_coef=0.946,
                      seed=SEED)
    elif algo.__name__ == "PPO2":
        print('Initializing PPO2')
        print('Num envs : ', env.num_envs)
        model = PPO2(mlp_standard,
                     env,
                     n_steps=int(args['n_steps'] / env.num_envs),
                     nminibatches=args['nminibatches'],
                     lam=args['lam'],
                     gamma=args['gamma'],
                     ent_coef=args['ent_coef'],
                     noptepochs=args['noptepochs'],
                     learning_rate=args['learning_rate'],
                     cliprange=args['cliprange'],
                     verbose=1,
                     tensorboard_log='data/TBlogs/initial_policy_training',
                     seed=SEED,
                     )
    else:
        print('No algorithm matched. Using SAC .. ')
        model = SAC(CustomPolicy, env,
                    verbose=1,
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    ent_coef=args['ent_coef'],
                    learning_starts=args['learning_starts'],
                    learning_rate=args['learning_rate'],
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )

    # change model name if using normalization
    if NORMALIZE:
        model_name = model_name.replace('.pkl', 'normalized_.pkl')
    elif MUJOCO_NORMALIZE:
        model_name = model_name.replace('.pkl', 'mujoco_norm_.pkl')

    if SAVE_BEST_FOR_20:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name,
                    log_interval=10,
                    callback=eval_callback)
        save_the_model()
        model_name = model_name.replace('best_', '')
        model.save(model_name)
    elif SAVE_INTERMEDIATE:
        check_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                            save_path=model_name[:-4],
                                            name_prefix=ENV_NAME + '_' + str(SEED),
                                            verbose=1,
                                            )
        eval_env = DummyVecEnv([lambda: gym.make(ENV_NAME)])
        eval_env.seed(SEED)
        eval_callback = EvalCallback(eval_env,
                                     n_eval_episodes=10,
                                     eval_freq=SAVE_FREQ,
                                     log_path=model_name[:-4],
                                     deterministic=False,
                                     render=False,
                                     verbose=1)

        callbacks = CallbackList([check_callback, eval_callback])
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name.split('/')[-1],
                    log_interval=10,
                    callback=callbacks)
        model.save(model_name)
        npzfile = np.load(model_name[:-4] + '/evaluations.npz')
        average_rewards = np.mean(npzfile['results'], axis=1)[:, 0]
        with open(model_name[:-4] + "/eval_results.txt", "a") as f:
            for i in range(np.shape(average_rewards)[0]):
                f.write("{}, {}\n".format(npzfile['timesteps'][i], average_rewards[i]))
        evaluate_policy_on_env(env, model, render=False, iters=50)
    else:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name.split('/')[-1],
                    log_interval=10)
        model.save(model_name)
        evaluate_policy_on_env(env, model, render=False, iters=50)

    # save the environment params
    if NORMALIZE:
        # env.save(model_name.replace('.pkl', 'stats_.pkl'))
        env.save('data/models/env_stats/' + env_name + '.pkl')

    print('done :: ', model_name)
    exit()
#       extension: .py
#       format_name: light
#       format_version: '1.4'
#       jupytext_version: 1.2.4
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# +
import gym

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines import ACKTR

# multiprocess environment
n_cpu = 4
env = SubprocVecEnv([lambda: gym.make('CartPole-v1') for i in range(n_cpu)])

model = ACKTR(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=250000)
# -

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
envTmp = gym.make('Battleships-v0', config=config)
# Wrap environment into a vector environment
env = DummyVecEnv([lambda: envTmp])

# Choose to display board
print("Display board: Yes (1), No (0)")
choiceRender = bool(int(input()))

# Choose Model
randomAgent = True
print("Choose Agent: Random (1), ACKTR (2), DQN (3)")
choice = int(input())
if choice == 2:
    # Load ACKTR Model
    model = ACKTR.load("./ACKTR_Models/ACKTR_5x5_3_2_2_Dynamic.zip", verbose=0, env=env)
    # Disable Random Agent
    randomAgent = False
elif choice == 3:
    # Load DQN Model
    model = DQN.load("./DQN_Models/DQN_5x5_3_2_2_Dynamic.zip", verbose=0, env=env)
    # Disable Random Agent
    randomAgent = False

# Init results array
results = []

# Iteration: Amount of played Games
for iteration in range(10):
    score = 0
    print('Iteration', iteration)
elif args.model == 'acer':
    model = ACER(policy, env, verbose=1, n_steps=64, tensorboard_log=out_dir)
elif args.model == 'ppo':
    model = PPO2(policy, env, verbose=1, n_steps=64, tensorboard_log=out_dir)
elif args.model == 'acktr':
    model = ACKTR(policy, env, n_steps=4, verbose=1, tensorboard_log=out_dir)
elif args.model == 'ddpg':
    model = DDPG(policy, env, verbose=1, tensorboard_log=out_dir)
elif args.model == 'a2c':
    model = A2C(policy, env, n_steps=64, verbose=1, tensorboard_log=out_dir)
elif args.model == 'sac':
    model = SAC("CnnPolicy", env)
    train(model, env, out_dir)
else:
    # results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "rl")
    pass
#! /usr/bin/env python
import gym
gym.logger.set_level(40)

import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

import env_yaw

from stable_baselines import DQN, PPO2, A2C, ACKTR
from stable_baselines.common.cmd_util import make_vec_env
from stable_baselines.common.evaluation import evaluate_policy

env = gym.make("Yaw-v0")
env = make_vec_env(lambda: env, n_envs=1)

# model = ACKTR.load("models/acktr_goleft", env=env)
model = ACKTR('MlpPolicy', env, verbose=1)

mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
    if (n_steps + 1) % 100000 == 0:
        print("Saving checkpoint model")
        _locals['self'].save(model_dir + 'model_{}_steps.pkl'.format(n_steps + 1))
    n_steps += 1
    return True


print('Starting Training')
"""
ACKTR(policy, env, gamma=0.99, nprocs=1, n_steps=20, ent_coef=0.01,
      vf_coef=0.25, vf_fisher_coef=1.0, learning_rate=0.25, max_grad_norm=0.5,
      kfac_clip=0.001, lr_schedule='linear', verbose=0, tensorboard_log=None,
      _init_setup_model=True, async_eigen_decomp=False, policy_kwargs=None,
      full_tensorboard_log=False)
"""
model = ACKTR(policy=MlpPolicy, env=env, gamma=0.99, nprocs=1, n_steps=20,
              ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0,
              learning_rate=0.25, max_grad_norm=0.5, kfac_clip=0.001,
              lr_schedule='linear', verbose=0, tensorboard_log=None,
              _init_setup_model=True)

model.learn(total_timesteps=num_timesteps, callback=custom_callback,
            seed=seed, log_interval=100)

print('Starting evaluation')
env = setup_env_cart_discrete(seed, log_dir)
model.set_env(env)
get_trajectories(model, trajectory_dir, n_trajectories=100)
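# get_trajectories is not defined in this excerpt. A minimal sketch, assuming
# it rolls out the trained model and saves one archive of observations,
# actions, and rewards per episode (the file layout is an assumption):
import os
import numpy as np

def get_trajectories(model, trajectory_dir, n_trajectories=100):
    os.makedirs(trajectory_dir, exist_ok=True)
    env = model.get_env()
    for episode in range(n_trajectories):
        obs, done = env.reset(), [False]
        observations, actions, rewards = [], [], []
        while not done[0]:
            action, _ = model.predict(obs)
            observations.append(obs)
            actions.append(action)
            obs, reward, done, _ = env.step(action)
            rewards.append(reward)
        np.savez(os.path.join(trajectory_dir, "trajectory_{}.npz".format(episode)),
                 observations=np.array(observations),
                 actions=np.array(actions),
                 rewards=np.array(rewards))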